From f1b708c4e29a392d84d69f820bcc45bfd89cc221 Mon Sep 17 00:00:00 2001 From: Tom Savage Date: Mon, 16 Sep 2024 09:04:31 +0100 Subject: [PATCH 01/50] Fixes detection of CuPy installed with pre-built wheels (#1965) The CuPy library ships a source distribution (`cupy`) as well as several distributions containing pre-built wheels (`cupy-cuda11x`, `cupy-cuda12x`, `cupy-rocm-5-0`, `cupy-rocm-4-3`). Using `_is_package_available` to detect CuPy only works for the source distribution and fails for the pre-built wheel versions. This is because `_is_package_available` always attempts to resolve version information (even when it is not required) and in doing so assumes that the _importable_ package name matches the _installed_ distribution name. While this is usually the case, it does not hold for CuPy and several other libraries. ONNX Runtime, for example, might be installed as `onnxruntime` or `onnxruntime-gpu`, which is why Optimum already uses `importlib.util.find_spec` to work around the same problem there. This commit replicates that solution for CuPy. --- optimum/onnxruntime/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index ad40af92b9d..985980e31b0 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -13,6 +13,7 @@ # limitations under the License. """Utility functions, classes and constants for ONNX Runtime.""" +import importlib import os import re from enum import Enum @@ -31,7 +32,6 @@ import onnxruntime as ort from ..exporters.onnx import OnnxConfig, OnnxConfigWithLoss -from ..utils.import_utils import _is_package_available if TYPE_CHECKING: @@ -91,9 +91,11 @@ def is_onnxruntime_training_available(): def is_cupy_available(): """ - Checks if onnxruntime-training is available. + Checks if CuPy is available. """ - return _is_package_available("cupy") + # Don't use _is_package_available as it doesn't work with CuPy installed + # with `cupy-cuda*` and `cupy-rocm-*` package names (pre-built wheels). 
+ return importlib.util.find_spec("cupy") is not None class ORTConfigManager: From ca36fc4f66577cd4ac2e6cedcc204d830a1f4985 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:08:34 +0200 Subject: [PATCH 02/50] Adding `ORTPipelineForxxx` entrypoints (#1960) * created auto task mappings * added correct auto classes * created auto task mappings * added correct auto classes * added ort/auto diffusion classes * fix ORTPipeline detection * start test refactoring * dynamic dtype * support torch random numbers generator * compact diffusion testing suite * fix * test * test * test * use latent-consistency architecture name instead of lcm * fix * add ort diffusion pipeline tests * added dummy objects * remove duplicate code * support testing without diffusers * remove unnecessary * revert * style * remove model parts from optimum.onnxruntime --- optimum/exporters/tasks.py | 2 +- optimum/modeling_base.py | 9 +- optimum/onnxruntime/__init__.py | 16 + optimum/onnxruntime/base.py | 50 +- optimum/onnxruntime/modeling_diffusion.py | 338 ++++++-- optimum/onnxruntime/modeling_seq2seq.py | 68 -- .../diffusers/pipeline_latent_consistency.py | 6 +- .../diffusers/pipeline_stable_diffusion.py | 16 +- .../pipeline_stable_diffusion_img2img.py | 83 +- .../pipeline_stable_diffusion_inpaint.py | 22 +- .../diffusers/pipeline_stable_diffusion_xl.py | 20 +- .../pipeline_stable_diffusion_xl_img2img.py | 28 +- optimum/pipelines/diffusers/pipeline_utils.py | 8 +- optimum/utils/dummy_diffusers_objects.py | 44 + tests/exporters/exporters_utils.py | 2 +- tests/onnxruntime/test_diffusion.py | 793 ++++++++++++++++++ tests/onnxruntime/test_modeling.py | 47 +- .../test_stable_diffusion_pipeline.py | 562 ------------- 18 files changed, 1287 insertions(+), 827 deletions(-) create mode 100644 tests/onnxruntime/test_diffusion.py delete mode 100644 tests/onnxruntime/test_stable_diffusion_pipeline.py diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 97053040879..a489f34fb06 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -308,9 +308,9 @@ class TasksManager: "image-feature-extraction": "feature-extraction", # for backward compatibility and testing (where # model task and model type are still the same) - "lcm": "text-to-image", "stable-diffusion": "text-to-image", "stable-diffusion-xl": "text-to-image", + "latent-consistency": "text-to-image", } _CUSTOM_CLASSES = { diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 5bab0622de4..3da2d9d0d21 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -85,7 +85,6 @@ class PreTrainedModel(ABC): # noqa: F811 class OptimizedModel(PreTrainedModel): config_class = AutoConfig - load_tf_weights = None base_model_prefix = "optimized_model" config_name = CONFIG_NAME @@ -378,10 +377,14 @@ def from_pretrained( ) model_id, revision = model_id.split("@") - library_name = TasksManager.infer_library_from_model(model_id, subfolder, revision, cache_dir, token=token) + library_name = TasksManager.infer_library_from_model( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if library_name == "timm": - config = PretrainedConfig.from_pretrained(model_id, subfolder, revision) + config = PretrainedConfig.from_pretrained( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if config is None: if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME: 
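To make the failure mode described in PATCH 01 above concrete, here is a minimal sketch, assuming CuPy was installed via `pip install cupy-cuda12x`, of why `importlib.util.find_spec` succeeds where version-based detection fails:

    import importlib.metadata
    import importlib.util

    # With `pip install cupy-cuda12x`, the installed *distribution* is named
    # "cupy-cuda12x" while the *importable* module is still named "cupy".
    try:
        importlib.metadata.version("cupy")  # resolves by distribution name
    except importlib.metadata.PackageNotFoundError:
        print("version lookup fails: no distribution named 'cupy' is installed")

    # find_spec resolves by importable module name, so it detects the source
    # distribution and all pre-built wheel variants alike.
    print(importlib.util.find_spec("cupy") is not None)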
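The entrypoints PATCH 02 adds (exported in the `__init__.py` diff that follows) can be used as in this minimal sketch; it reuses the tiny test checkpoint and prompt from the test suite further below, any diffusers text-to-image repo should behave the same, and `export=True` triggers the usual Optimum ONNX export path:

    from optimum.onnxruntime import ORTDiffusionPipeline, ORTPipelineForText2Image

    model_id = "hf-internal-testing/tiny-stable-diffusion-torch"

    # ORTDiffusionPipeline reads `_class_name` from the repo's model_index.json and
    # dispatches to the matching ORT class (here ORTStableDiffusionPipeline).
    pipeline = ORTDiffusionPipeline.from_pretrained(model_id, export=True)

    # ORTPipelineForText2Image instead maps the model architecture ("stable-diffusion",
    # "stable-diffusion-xl" or "latent-consistency") to its text-to-image pipeline.
    pipeline = ORTPipelineForText2Image.from_pretrained(model_id, export=True)

    image = pipeline("sailing ship in storm by Leonardo da Vinci", num_inference_steps=3).images[0]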
diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index f1d4f63a9ff..09a48ec955c 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -79,6 +79,10 @@ "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + "ORTPipelineForText2Image", + "ORTDiffusionPipeline", ] else: _import_structure["modeling_diffusion"] = [ @@ -88,6 +92,10 @@ "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + "ORTPipelineForText2Image", + "ORTDiffusionPipeline", ] @@ -137,7 +145,11 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( + ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, @@ -146,7 +158,11 @@ ) else: from .modeling_diffusion import ( + ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index d9877670ba8..0e54bafed78 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -41,17 +41,11 @@ class ORTModelPart: _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs - def __init__( - self, - session: InferenceSession, - parent_model: "ORTModel", - ): + def __init__(self, session: InferenceSession, parent_model: "ORTModel"): self.session = session self.parent_model = parent_model - self.normalized_config = NormalizedConfigManager.get_normalized_config_class( - self.parent_model.config.model_type - )(self.parent_model.config) self.main_input_name = self.parent_model.main_input_name + self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} @@ -90,12 +84,18 @@ class ORTEncoder(ORTModelPart): Encoder part of the encoder-decoder model for ONNX Runtime inference. 
""" - def forward( - self, - input_ids: torch.LongTensor, - attention_mask: torch.LongTensor, - **kwargs, - ) -> BaseModelOutput: + def __init__(self, session: InferenceSession, parent_model: "ORTModel"): + super().__init__(session, parent_model) + + config = ( + self.parent_model.config.encoder + if hasattr(self.parent_model.config, "encoder") + else self.parent_model.config + ) + + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + + def forward(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, **kwargs) -> BaseModelOutput: use_torch = isinstance(input_ids, torch.Tensor) self.parent_model.raise_on_numpy_input_io_binding(use_torch) @@ -138,6 +138,14 @@ def __init__( ): super().__init__(session, parent_model) + config = ( + self.parent_model.config.decoder + if hasattr(self.parent_model.config, "decoder") + else self.parent_model.config + ) + + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + # TODO: make this less hacky. self.key_value_input_names = [key for key in self.input_names if (".key" in key) or (".value" in key)] self.key_value_output_names = [key for key in self.output_names if (".key" in key) or (".value" in key)] @@ -153,11 +161,7 @@ def __init__( self.use_past_in_outputs = len(self.key_value_output_names) > 0 self.use_past_in_inputs = len(self.key_value_input_names) > 0 - self.use_fp16 = False - for inp in session.get_inputs(): - if "past_key_values" in inp.name and inp.type == "tensor(float16)": - self.use_fp16 = True - break + self.use_fp16 = self.dtype == torch.float16 # We may use ORTDecoderForSeq2Seq for vision-encoder-decoder models, where models as gpt2 # can be used but do not support KV caching for the cross-attention key/values, see: @@ -461,11 +465,3 @@ def prepare_inputs_for_merged( cache_position = cache_position.to(self.device) return use_cache_branch_tensor, past_key_values, cache_position - - -class ORTDecoder(ORTDecoderForSeq2Seq): - def __init__(self, *args, **kwargs): - logger.warning( - "The class `ORTDecoder` is deprecated and will be removed in optimum v1.15.0, please use `ORTDecoderForSeq2Seq` instead." 
- ) - super().__init__(*args, **kwargs) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 4bbfb2eda2a..18cd38c5f29 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -17,7 +17,7 @@ import os import shutil import warnings -from abc import abstractmethod +from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Dict, Optional, Union @@ -25,18 +25,28 @@ import numpy as np import torch from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + ConfigMixin, DDIMScheduler, + LatentConsistencyModelPipeline, LMSDiscreteScheduler, PNDMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLPipeline, ) from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available from huggingface_hub import snapshot_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import validate_hf_hub_args from transformers import CLIPFeatureExtractor, CLIPTokenizer from transformers.file_utils import add_end_docstrings +from transformers.modeling_outputs import ModelOutput import onnxruntime as ort @@ -56,9 +66,10 @@ DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) +from .base import ORTModelPart +from .io_binding import TypeHelper from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel from .utils import ( - _ORT_TO_NP_TYPE, ONNX_WEIGHTS_NAME, get_provider_for_device, parse_device, @@ -69,23 +80,23 @@ logger = logging.getLogger(__name__) -class ORTStableDiffusionPipelineBase(ORTModel): - auto_model_class = StableDiffusionPipeline - main_input_name = "input_ids" - base_model_prefix = "onnx_model" +class ORTPipeline(ORTModel): + auto_model_class = None + model_type = "onnx_pipeline" + config_name = "model_index.json" sub_component_config_name = "config.json" def __init__( self, vae_decoder_session: ort.InferenceSession, - text_encoder_session: ort.InferenceSession, unet_session: ort.InferenceSession, - config: Dict[str, Any], tokenizer: CLIPTokenizer, + config: Dict[str, Any], scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], feature_extractor: Optional[CLIPFeatureExtractor] = None, vae_encoder_session: Optional[ort.InferenceSession] = None, + text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, tokenizer_2: Optional[CLIPTokenizer] = None, use_io_binding: Optional[bool] = None, @@ -94,23 +105,28 @@ def __init__( """ Args: vae_decoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the VAE decoder. - text_encoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the text encoder. + The ONNX Runtime inference session associated to the VAE decoder unet_session (`ort.InferenceSession`): The ONNX Runtime inference session associated to the U-NET. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + for the text encoder. config (`Dict[str, Any]`): A config dictionary from which the model components will be instantiated. 
Make sure to only load configuration files of compatible classes. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`): A scheduler to be used in combination with the U-NET component to denoise the encoded image latents. feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`): A model extracting features from generated images to be used as inputs for the `safety_checker` vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): The ONNX Runtime inference session associated to the VAE encoder. + text_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): + The ONNX Runtime inference session associated to the text encoder. + tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + for the second text encoder. use_io_binding (`Optional[bool]`, defaults to `None`): Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to `True` if the device is CUDA, otherwise defaults to `False`. @@ -118,7 +134,7 @@ def __init__( The directory under which the model exported to ONNX was saved. """ self.shared_attributes_init( - vae_decoder_session, + model=vae_decoder_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, ) @@ -350,9 +366,9 @@ def _from_pretrained( text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_2_path=new_model_save_dir - / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER - / text_encoder_2_file_name, + text_encoder_2_path=( + new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name + ), provider=provider, session_options=session_options, provider_options=provider_options, @@ -399,7 +415,7 @@ def _from_transformers( provider_options: Optional[Dict[str, Any]] = None, use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTStableDiffusionPipeline": + ) -> "ORTPipeline": if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", @@ -480,131 +496,142 @@ def _save_config(self, save_directory): self.save_config(save_directory) -# TODO : Use ORTModelPart once IOBinding support is added -class _ORTDiffusionModelPart: - """ - For multi-file ONNX models, represents a part of the model. - It has its own `onnxruntime.InferenceSession`, and can perform a forward pass. 
- """ - +class ORTPipelinePart(ORTModelPart): CONFIG_NAME = "config.json" - def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): - self.session = session - self.parent_model = parent_model - self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} - self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): config_path = Path(session._model_path).parent / self.CONFIG_NAME - self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - self.input_dtype = {inputs.name: _ORT_TO_NP_TYPE[inputs.type] for inputs in self.session.get_inputs()} + + if config_path.is_file(): + # TODO: use FrozenDict + self.config = parent_model._dict_from_json_file(config_path) + else: + self.config = {} + + super().__init__(session, parent_model) @property - def device(self): - return self.parent_model.device + def input_dtype(self): + # for backward compatibility and diffusion mixins (will be standardized in the future) + return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} - @abstractmethod - def forward(self, *args, **kwargs): - pass - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) +class ORTModelTextEncoder(ORTPipelinePart): + def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(input_ids, torch.Tensor) + model_inputs = {"input_ids": input_ids} -class ORTModelTextEncoder(_ORTDiffusionModelPart): - def forward(self, input_ids: np.ndarray): - onnx_inputs = { - "input_ids": input_ids, - } - outputs = self.session.run(None, onnx_inputs) - return outputs + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + return ModelOutput(**model_outputs) -class ORTModelUnet(_ORTDiffusionModelPart): - def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): - super().__init__(session, parent_model) +class ORTModelUnet(ORTPipelinePart): def forward( self, - sample: np.ndarray, - timestep: np.ndarray, - encoder_hidden_states: np.ndarray, - text_embeds: Optional[np.ndarray] = None, - time_ids: Optional[np.ndarray] = None, - timestep_cond: Optional[np.ndarray] = None, + sample: Union[np.ndarray, torch.Tensor], + timestep: Union[np.ndarray, torch.Tensor], + encoder_hidden_states: Union[np.ndarray, torch.Tensor], + text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, + time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, ): - onnx_inputs = { + use_torch = isinstance(sample, torch.Tensor) + + model_inputs = { "sample": sample, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, + "text_embeds": text_embeds, + "time_ids": time_ids, + "timestep_cond": timestep_cond, } - if text_embeds is not None: - onnx_inputs["text_embeds"] = text_embeds - if time_ids is not None: - onnx_inputs["time_ids"] = time_ids - if timestep_cond is not None: - onnx_inputs["timestep_cond"] = timestep_cond - outputs = self.session.run(None, onnx_inputs) - return outputs + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + return 
ModelOutput(**model_outputs) -class ORTModelVaeDecoder(_ORTDiffusionModelPart): - def forward(self, latent_sample: np.ndarray): - onnx_inputs = { - "latent_sample": latent_sample, - } - outputs = self.session.run(None, onnx_inputs) - return outputs +class ORTModelVaeDecoder(ORTPipelinePart): + def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(latent_sample, torch.Tensor) -class ORTModelVaeEncoder(_ORTDiffusionModelPart): - def forward(self, sample: np.ndarray): - onnx_inputs = { - "sample": sample, - } - outputs = self.session.run(None, onnx_inputs) - return outputs + model_inputs = {"latent_sample": latent_sample} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) + + +class ORTModelVaeEncoder(ORTPipelinePart): + def forward(self, sample: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(sample, torch.Tensor) + + model_inputs = {"sample": sample} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). """ + main_input_name = "prompt" + auto_model_class = StableDiffusionPipeline + __call__ = StableDiffusionPipelineMixin.__call__ @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ + main_input_name = "prompt" + auto_model_class = StableDiffusionImg2ImgPipeline + __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). 
""" + main_input_name = "prompt" + auto_model_class = StableDiffusionInpaintPipeline + __call__ = StableDiffusionInpaintPipelineMixin.__call__ @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentConsistencyPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ - __call__ = LatentConsistencyPipelineMixin.__call__ + main_input_name = "prompt" + auto_model_class = LatentConsistencyModelPipeline + __call__ = LatentConsistencyPipelineMixin.__call__ -class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase): - auto_model_class = StableDiffusionXLImg2ImgPipeline +class ORTStableDiffusionXLPipelineBase(ORTPipeline): def __init__( self, vae_decoder_session: ort.InferenceSession, @@ -657,6 +684,9 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). """ + main_input_name = "prompt" + auto_model_class = StableDiffusionXLPipeline + __call__ = StableDiffusionXLPipelineMixin.__call__ @@ -666,4 +696,140 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
""" + main_input_name = "prompt" + auto_model_class = StableDiffusionXLImg2ImgPipeline + __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + + +SUPPORTED_ORT_PIPELINES = [ + ORTStableDiffusionPipeline, + ORTStableDiffusionImg2ImgPipeline, + ORTStableDiffusionInpaintPipeline, + ORTLatentConsistencyModelPipeline, + ORTStableDiffusionXLPipeline, + ORTStableDiffusionXLImg2ImgPipeline, +] + + +def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): + for ort_pipeline_class in SUPPORTED_ORT_PIPELINES: + if ( + ort_pipeline_class.__name__ == pipeline_class_name + or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): + return ort_pipeline_class + + if throw_error_if_not_exist: + raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}") + + +class ORTDiffusionPipeline(ConfigMixin): + config_name = "model_index.json" + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + load_config_kwargs = { + "force_download": kwargs.get("force_download", False), + "resume_download": kwargs.get("resume_download", None), + "local_files_only": kwargs.get("local_files_only", False), + "cache_dir": kwargs.get("cache_dir", None), + "revision": kwargs.get("revision", None), + "proxies": kwargs.get("proxies", None), + "token": kwargs.get("token", None), + } + + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + config = config[0] if isinstance(config, tuple) else config + class_name = config["_class_name"] + + ort_pipeline_class = _get_pipeline_class(class_name) + + return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) + + +ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), + ("latent-consistency", ORTLatentConsistencyModelPipeline), + ] +) + +ORT_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), + ] +) + +ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ] +) + +SUPPORTED_ORT_PIPELINES_MAPPINGS = [ + ORT_TEXT2IMAGE_PIPELINES_MAPPING, + ORT_IMAGE2IMAGE_PIPELINES_MAPPING, + ORT_INPAINT_PIPELINES_MAPPING, +] + + +def _get_task_class(mapping, pipeline_class_name): + def _get_model_name(pipeline_class_name): + for ort_pipelines_mapping in SUPPORTED_ORT_PIPELINES_MAPPINGS: + for model_name, ort_pipeline_class in ort_pipelines_mapping.items(): + if ( + ort_pipeline_class.__name__ == pipeline_class_name + or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): + return model_name + + model_name = _get_model_name(pipeline_class_name) + + if model_name is not None: + task_class = mapping.get(model_name, None) + if task_class is not None: + return task_class + + raise ValueError(f"ORTPipelineForTask can't find a pipeline linked to {pipeline_class_name} for {model_name}") + + +class ORTPipelineForTask(ConfigMixin): + config_name = "model_index.json" + + @classmethod + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + load_config_kwargs = { + "force_download": kwargs.get("force_download", False), + "resume_download": kwargs.get("resume_download", None), + "local_files_only": kwargs.get("local_files_only", False), + "cache_dir": kwargs.get("cache_dir", None), + "revision": kwargs.get("revision", None), + "proxies": 
kwargs.get("proxies", None), + "token": kwargs.get("token", None), + } + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + config = config[0] if isinstance(config, tuple) else config + class_name = config["_class_name"] + + ort_pipeline_class = _get_task_class(cls.ort_pipelines_mapping, class_name) + + return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) + + +class ORTPipelineForText2Image(ORTPipelineForTask): + auto_model_class = AutoPipelineForText2Image + ort_pipelines_mapping = ORT_TEXT2IMAGE_PIPELINES_MAPPING + + +class ORTPipelineForImage2Image(ORTPipelineForTask): + auto_model_class = AutoPipelineForImage2Image + ort_pipelines_mapping = ORT_IMAGE2IMAGE_PIPELINES_MAPPING + + +class ORTPipelineForInpainting(ORTPipelineForTask): + auto_model_class = AutoPipelineForInpainting + ort_pipelines_mapping = ORT_INPAINT_PIPELINES_MAPPING diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 4ce3e4707ed..3cecadafe3e 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -46,7 +46,6 @@ from ..onnx.utils import _get_external_data_paths from ..utils import check_if_transformers_greater from ..utils.file_utils import validate_file_exists -from ..utils.normalized_config import NormalizedConfigManager from ..utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors from .base import ORTDecoderForSeq2Seq, ORTEncoder from .constants import ( @@ -72,16 +71,6 @@ from transformers.generation_utils import GenerationMixin -# if check_if_transformers_greater("4.37.0"): -# # starting from transformers v4.37.0, the whisper generation loop is implemented in the `WhisperGenerationMixin` -# # and it implements many new features including short and long form generation, and starts with 2 init tokens -# from transformers.models.whisper.generation_whisper import WhisperGenerationMixin -# else: - -# class WhisperGenerationMixin(WhisperForConditionalGeneration, GenerationMixin): -# pass - - if check_if_transformers_greater("4.43.0"): from transformers.cache_utils import EncoderDecoderCache else: @@ -1165,49 +1154,6 @@ class ORTModelForSeq2SeqLM(ORTModelForConditionalGeneration, GenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" - def __init__( - self, - encoder_session: ort.InferenceSession, - decoder_session: ort.InferenceSession, - config: "PretrainedConfig", - onnx_paths: List[str], - decoder_with_past_session: Optional[ort.InferenceSession] = None, - use_cache: bool = True, - use_io_binding: Optional[bool] = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - preprocessors: Optional[List] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ): - super().__init__( - encoder_session, - decoder_session, - config, - onnx_paths, - decoder_with_past_session, - use_cache, - use_io_binding, - model_save_dir, - preprocessors, - generation_config, - **kwargs, - ) - - # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized. 
- if config.model_type == "encoder-decoder": - self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.encoder.model_type - )(config.encoder) - - self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - - if self.decoder_with_past is not None: - self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoder(session, self) @@ -1521,20 +1467,6 @@ def __init__( **kwargs, ) - # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized. - self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.encoder.model_type - )(config.encoder) - - self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - - if self.decoder_with_past is not None: - self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoderForVisionEncoderDecoder(session, self) diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py index 41c85b5b6ac..630d463de73 100644 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ b/optimum/pipelines/diffusers/pipeline_latent_consistency.py @@ -36,7 +36,7 @@ def __call__( original_inference_steps: int = None, guidance_scale: float = 8.5, num_images_per_prompt: int = 1, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, output_type: str = "pil", @@ -66,7 +66,7 @@ def __call__( usually at the expense of lower image quality. num_images_per_prompt (`int`, defaults to 1): The number of images to generate per prompt. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): A np.random.RandomState to make generation deterministic. 
latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -121,7 +121,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() prompt_embeds = self._encode_prompt( prompt, diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index 98bff0de44d..6cc47fab1b9 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -189,7 +189,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype ) if latents is None: - latents = generator.randn(*shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*shape).astype(dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) elif latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") @@ -209,7 +217,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -248,7 +256,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: A np.random.RandomState to make generation deterministic. latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -303,7 +311,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py index 81a6ffa1e04..a66035a789b 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py @@ -16,10 +16,9 @@ from typing import Callable, List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import deprecate from .pipeline_stable_diffusion import StableDiffusionPipelineMixin @@ -72,6 +71,43 @@ def check_inputs( f" {negative_prompt_embeds.shape}." 
) + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + else: + init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = np.concatenate([init_latents], axis=0) + + # add noise to latents using the timesteps + if isinstance(generator, np.random.RandomState): + noise = generator.randn(*init_latents.shape).astype(dtype) + elif isinstance(generator, torch.Generator): + noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) + + init_latents = self.scheduler.add_noise( + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) + ).numpy() + + return init_latents + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__ def __call__( self, @@ -83,7 +119,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, output_type: str = "pil", @@ -125,7 +161,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): A np.random.RandomState to make generation deterministic. prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not @@ -168,7 +204,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -191,31 +227,7 @@ def __call__( latents_dtype = prompt_embeds.dtype image = image.astype(latents_dtype) - # encode the init image into latents and scale the latents - init_latents = self.vae_encoder(sample=image)[0] - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - init_latents = scaling_factor * init_latents - - if isinstance(prompt, str): - prompt = [prompt] - if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = len(prompt) // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) - elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts." - ) - else: - init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0) # get the original timestep using init_timestep offset = self.scheduler.config.get("steps_offset", 0) @@ -225,12 +237,8 @@ def __call__( timesteps = self.scheduler.timesteps.numpy()[-init_timestep] timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) - # add noise to latents using the timesteps - noise = generator.randn(*init_latents.shape).astype(latents_dtype) - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ) - init_latents = init_latents.numpy() + # 5. Prepare latent variables + latents = self.prepare_latents(image, timesteps, batch_size, num_images_per_prompt, latents_dtype, generator) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
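The pipelines touched by this patch all standardize the same dual-generator handling shown in the hunks above and below; a minimal self-contained sketch of that pattern (`make_noise` is a hypothetical helper name, not part of the patch):

    import numpy as np
    import torch

    def make_noise(shape, dtype, generator=None):
        # Accept either a NumPy RandomState or a torch.Generator, as the
        # pipelines now do, and fall back to a fresh RandomState.
        if generator is None:
            generator = np.random.RandomState()
        if isinstance(generator, np.random.RandomState):
            return generator.randn(*shape).astype(dtype)
        if isinstance(generator, torch.Generator):
            return torch.randn(*shape, generator=generator).numpy().astype(dtype)
        raise ValueError(
            f"Expected `generator` to be of type `np.random.RandomState` or"
            f" `torch.Generator`, but got {type(generator)}."
        )

    # Either seeding style now yields run-to-run reproducible latents:
    np_noise = make_noise((1, 4, 16, 16), np.float32, np.random.RandomState(42))
    pt_noise = make_noise((1, 4, 16, 16), np.float32, torch.Generator().manual_seed(42))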
@@ -241,8 +249,6 @@ def __call__( if accepts_eta: extra_step_kwargs["eta"] = eta - latents = init_latents - t_start = max(num_inference_steps - init_timestep + offset, 0) timesteps = self.scheduler.timesteps[t_start:].numpy() @@ -276,7 +282,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py index 19de793ccd0..cb3c7db96e9 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py @@ -16,7 +16,7 @@ from typing import Callable, List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.utils import PIL_INTERPOLATION @@ -108,7 +108,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -200,7 +200,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -229,11 +229,19 @@ def __call__( width // self.vae_scale_factor, ) latents_dtype = prompt_embeds.dtype + if latents is None: - latents = generator.randn(*latents_shape).astype(latents_dtype) - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*latents_shape).astype(latents_dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*latents_shape, generator=generator).numpy().astype(latents_dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." 
+ ) + elif latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # prepare mask and masked_image mask, masked_image = prepare_mask_and_masked_image( diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py index 2a5e7bf78b0..0407c16a77a 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -235,7 +235,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype ) if latents is None: - latents = generator.randn(*shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*shape).astype(dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) elif latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") @@ -270,7 +278,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -315,7 +323,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: A np.random.RandomState to make generation deterministic. latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -383,7 +391,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -440,6 +448,7 @@ def __call__( timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance @@ -475,7 +484,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py index a07903a735e..19988599b64 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py @@ -17,7 +17,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput @@ -222,7 +222,7 @@ def get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): + def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): batch_size = batch_size * num_images_per_prompt if image.shape[1] == 4: @@ -242,11 +242,22 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt init_latents = np.concatenate([init_latents], axis=0) # add noise to latents using the timesteps - noise = generator.randn(*init_latents.shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + noise = generator.randn(*init_latents.shape).astype(dtype) + elif isinstance(generator, torch.Generator): + noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) + init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timestep) + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) ) - return init_latents.numpy() + init_latents = init_latents.numpy() + + return init_latents def _get_add_time_ids( self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype @@ -274,7 +285,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -375,7 +386,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` @@ -482,7 +493,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py index 869b91ffe59..e9d5986b61c 100644 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ b/optimum/pipelines/diffusers/pipeline_utils.py @@ -17,7 +17,7 @@ from typing import List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers import ConfigMixin from diffusers.image_processor import VaeImageProcessor as DiffusersVaeImageProcessor @@ -206,7 +206,7 @@ def postprocess( def get_height_width( self, - image: [PIL.Image.Image, np.ndarray], + image: Union[PIL.Image.Image, np.ndarray], height: Optional[int] = None, width: Optional[int] = None, ): @@ -264,10 +264,10 @@ def reshape(images: np.ndarray) -> np.ndarray: # TODO : remove after diffusers v0.21.0 release def resize( self, - image: [PIL.Image.Image, np.ndarray, torch.Tensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], height: Optional[int] = None, width: Optional[int] = None, - ) -> [PIL.Image.Image, np.ndarray, torch.Tensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Resize image. """ diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index f6914bbcd3a..35d1ffe9fc7 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -79,3 +79,47 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) + + +class ORTDiffusionPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForText2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForImage2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForInpainting(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index a55c7a124df..c8a33b0be35 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -298,7 +298,7 @@ PYTORCH_DIFFUSION_MODEL = { "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", - "lcm": "echarlaix/tiny-random-latent-consistency", + "latent-consistency": "echarlaix/tiny-random-latent-consistency", } PYTORCH_TIMM_MODEL = { diff --git 
a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py new file mode 100644 index 00000000000..9f480b2d1a0 --- /dev/null +++ b/tests/onnxruntime/test_diffusion.py @@ -0,0 +1,793 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import numpy as np +import PIL +import pytest +import torch +from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + DiffusionPipeline, +) +from diffusers.utils import load_image +from parameterized import parameterized +from transformers.testing_utils import require_torch_gpu +from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin + +from optimum.onnxruntime import ( + ORTDiffusionPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, +) +from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor +from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm + + +def get_generator(framework, seed): + if framework == "np": + return np.random.RandomState(seed) + elif framework == "pt": + return torch.Generator().manual_seed(seed) + else: + raise ValueError(f"Unknown framework: {framework}") + + +def _generate_prompts(batch_size=1): + inputs = { + "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + +def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type="pil"): + if input_type == "pil": + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo.png" + ).resize((width, height)) + elif input_type == "np": + image = np.random.rand(height, width, channel) + elif input_type == "pt": + image = torch.rand((channel, height, width)) + + return [image] * batch_size + + +def to_np(image): + if isinstance(image[0], PIL.Image.Image): + return np.stack([np.array(i) for i in image], axis=0) + elif isinstance(image, torch.Tensor): + return image.cpu().numpy().transpose(0, 2, 3, 1) + return image + + +class ORTPipelineForText2ImageTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"] + + ORTMODEL_CLASS = ORTPipelineForText2Image + AUTOMODEL_CLASS = AutoPipelineForText2Image + + TASK = "text-to-image" + + def generate_inputs(self, height=128, width=128, batch_size=1): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["height"] = height + inputs["width"] = width + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn( + f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) + ) + + 
@parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_ort_pipeline_class_dispatch(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        for num_images in [1, 3]:
+            outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+            self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 128, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        if model_arch == "latent-consistency":
+            # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step
+            # TODO: Investigate why this is the case
+            inputs["num_inference_steps"] = 1
+
+        for output_type in ["latent", "np"]:
+            inputs["output_type"] = output_type
+
+            ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+            diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+            self.assertTrue(
+                np.allclose(ort_output, diffusers_output, atol=1e-4),
+                np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4),
+            )
+        self.assertEqual(ort_pipeline.device, diffusers_pipeline.device)
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        height, width, batch_size = 64, 32, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 64, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ort_callback = Callback()
+        auto_callback = Callback()
+
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertTrue(auto_callback.has_been_called)
+        self.assertEqual(auto_callback.number_of_steps, ort_callback.number_of_steps)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        height, width, batch_size = 128, 64, 1
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        for output_type in ["np", "pil", "latent"]:
+            inputs["output_type"] = output_type
+            outputs = pipeline(**inputs).images
+            if output_type == "pil":
+                self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+            elif output_type == "np":
+                self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+            else:
+                self.assertEqual(
+                    outputs.shape,
+                    (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                )
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        if model_arch in ["latent-consistency"]:
+            pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+ + for generator_framework in ["np", "pt"]: + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_negative_prompt(self, model_arch: str): + if model_arch in ["latent-consistency"]: + pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") + + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + negative_prompt = ["This is a negative prompt"] + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + image_slice_1 = pipeline( + **inputs, negative_prompt=negative_prompt, generator=np.random.RandomState(SEED) + ).images[0, -3:, -3:, -1] + prompt = inputs.pop("prompt") + + if model_arch == "stable-diffusion-xl": + ( + inputs["prompt_embeds"], + inputs["negative_prompt_embeds"], + inputs["pooled_prompt_embeds"], + inputs["negative_pooled_prompt_embeds"], + ) = pipeline._encode_prompt(prompt, 1, False, negative_prompt) + else: + text_ids = pipeline.tokenizer( + prompt, + max_length=pipeline.tokenizer.model_max_length, + padding="max_length", + return_tensors="np", + truncation=True, + ).input_ids + negative_text_ids = pipeline.tokenizer( + negative_prompt, + max_length=pipeline.tokenizer.model_max_length, + padding="max_length", + return_tensors="np", + truncation=True, + ).input_ids + inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0] + inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0] + + image_slice_2 = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1] + + self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1)) + + +class ORTPipelineForImage2ImageTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + + AUTOMODEL_CLASS = AutoPipelineForImage2Image + ORTMODEL_CLASS = ORTPipelineForImage2Image + + TASK = "image-to-image" + + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="np"): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["image"] = _generate_images( + height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type + ) + + inputs["strength"] = 0.75 + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn( + f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) + ) + + @parameterized.expand(list(SUPPORTED_ARCHITECTURES)) + @require_diffusers + def test_ort_pipeline_class_dispatch(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + 
self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        batch_size, height = 1, 32
+        for width in [64, 32]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+            for num_images in [1, 3]:
+                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        if model_arch in ["stable-diffusion"]:
+            pytest.skip(
+                "Stable Diffusion For Img2Img doesn't behave as expected with callbacks (doesn't call it every step with callback_steps=1)"
+            )
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["num_inference_steps"] = 3
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        ort_callback = Callback()
+        auto_callback = Callback()
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        height, width, batch_size = 32, 64, 1
+
+        for input_type in ["np", "pil", "pt"]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
+
+            for output_type in ["np", "pil", "latent"]:
+                inputs["output_type"] = output_type
+                outputs = pipeline(**inputs).images
+                if output_type == "pil":
+                    self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+                elif output_type == "np":
+                    self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+                else:
+                    self.assertEqual(
+                        outputs.shape,
+                        (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                    )
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        pytest.skip("Img2Img models do not support output reproducibility for some reason")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 128, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+        self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        pytest.skip("Img2Img models do not support output reproducibility for some reason")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        for generator_framework in ["np", "pt"]:
+            ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1))
+
+            self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
+            self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
+
+
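For orientation, the image-to-image entrypoint exercised above follows the diffusers Auto* API. A minimal usage sketch, assuming a hypothetical ONNX-exported checkpoint id (any img2img-capable export would do; the image URL and call parameters mirror the ones used by this test suite):

    import numpy as np
    from diffusers.utils import load_image
    from optimum.onnxruntime import ORTPipelineForImage2Image

    # "my-org/my-sd-onnx" is a placeholder repo id, not a checkpoint this suite ships.
    pipe = ORTPipelineForImage2Image.from_pretrained("my-org/my-sd-onnx")
    init_image = load_image(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        "/in_paint/overture-creations-5sI6fQgYIuo.png"
    ).resize((128, 128))
    images = pipe(
        prompt="sailing ship in storm by Leonardo da Vinci",
        image=init_image,
        strength=0.75,
        num_inference_steps=3,
        output_type="np",
    ).images
    # With output_type="np", the result is an array of shape (batch, height, width, 3).
    assert isinstance(images, np.ndarray)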
+class ORTPipelineForInpaintingTest(ORTModelTestMixin):
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion"]
+
+    AUTOMODEL_CLASS = AutoPipelineForInpainting
+    ORTMODEL_CLASS = ORTPipelineForInpainting
+
+    TASK = "inpainting"
+
+    def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"):
+        assert batch_size == 1, "Inpainting models only support batch_size=1"
+        assert input_type == "pil", "Inpainting models only support input_type='pil'"
+
+        inputs = _generate_prompts(batch_size=batch_size)
+
+        inputs["image"] = _generate_images(
+            height=height, width=width, batch_size=1, channel=channel, input_type="pil"
+        )[0]
+        inputs["mask_image"] = _generate_images(
+            height=height, width=width, batch_size=1, channel=channel, input_type="pil"
+        )[0]
+
+        inputs["height"] = height
+        inputs["width"] = width
+
+        return inputs
+
+    @require_diffusers
+    def test_load_vanilla_model_which_is_not_supported(self):
+        with self.assertRaises(Exception) as context:
+            _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
+
+        self.assertIn(
+            f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
+        )
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_ort_pipeline_class_dispatch(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        batch_size, height = 1, 32
+        for width in [64, 32]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+            for num_images in [1, 3]:
+                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["num_inference_steps"] = 3
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        ort_callback = Callback()
+        auto_callback = Callback()
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        height, width, batch_size = 32, 64, 1
+
+        for input_type in ["pil"]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
+
+            for output_type in ["np", "pil", "latent"]:
+                inputs["output_type"] = output_type
+                outputs = pipeline(**inputs).images
+                if output_type == "pil":
+                    self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+                elif output_type == "np":
+                    self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+                else:
+                    self.assertEqual(
+                        outputs.shape,
+                        (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                    )
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        if model_arch in ["stable-diffusion"]:
+            pytest.skip(
+                "Stable Diffusion For Inpainting fails: it used to be compared to StableDiffusionPipeline, which is the text-to-image variant, for some reason"
+            )
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        diffusers_pipeline = 
self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + latents_shape = ( + batch_size, + ort_pipeline.vae_decoder.config["latent_channels"], + height // ort_pipeline.vae_scale_factor, + width // ort_pipeline.vae_scale_factor, + ) + + np_latents = np.random.rand(*latents_shape).astype(np.float32) + torch_latents = torch.from_numpy(np_latents) + + ort_output = ort_pipeline(**inputs, latents=np_latents).images + diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images + + self.assertTrue( + np.allclose(ort_output, diffusers_output, atol=1e-4), + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), + ) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + for generator_framework in ["np", "pt"]: + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + +class ImageProcessorTest(unittest.TestCase): + def test_vae_image_processor_pt(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_pt = torch.stack(_generate_images(height=8, width=8, batch_size=1, input_type="pt")) + input_np = to_np(input_pt) + + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) + out_np = to_np(out) + in_np = (input_np * 255).round() if output_type == "pil" else input_np + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + + def test_vae_image_processor_np(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_np = np.stack(_generate_images(height=8, width=8, input_type="np")) + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) + out_np = to_np(out) + in_np = (input_np * 255).round() if output_type == "pil" else input_np + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + + def test_vae_image_processor_pil(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_pil = _generate_images(height=8, width=8, batch_size=1, input_type="pil") + + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) + for i, o in zip(input_pil, out): + in_np = np.array(i) + out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 4b44acb38ab..199b96342e7 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -89,15 +89,8 @@ ORTModelForSpeechSeq2Seq, 
ORTModelForTokenClassification, ORTModelForVision2Seq, - ORTStableDiffusionPipeline, ) from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder -from optimum.onnxruntime.modeling_diffusion import ( - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, -) from optimum.onnxruntime.modeling_ort import ORTModel from optimum.pipelines import pipeline from optimum.utils import ( @@ -108,7 +101,24 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) -from optimum.utils.testing_utils import grid_parameters, remove_directory, require_hf_token, require_ort_rocm +from optimum.utils.import_utils import is_diffusers_available +from optimum.utils.testing_utils import ( + grid_parameters, + remove_directory, + require_diffusers, + require_hf_token, + require_ort_rocm, +) + + +if is_diffusers_available(): + from optimum.onnxruntime.modeling_diffusion import ( + ORTModelTextEncoder, + ORTModelUnet, + ORTModelVaeDecoder, + ORTModelVaeEncoder, + ORTStableDiffusionPipeline, + ) logger = logging.get_logger() @@ -205,6 +215,7 @@ def test_load_seq2seq_model_from_empty_cache(self): with self.assertRaises(Exception): _ = ORTModelForSeq2SeqLM.from_pretrained(self.TINY_ONNX_SEQ2SEQ_MODEL_ID, local_files_only=True) + @require_diffusers def test_load_stable_diffusion_model_from_cache(self): _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching @@ -218,6 +229,7 @@ def test_load_stable_diffusion_model_from_cache(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + @require_diffusers def test_load_stable_diffusion_model_from_empty_cache(self): dirpath = os.path.join( default_cache_path, "models--" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID.replace("/", "--") @@ -300,6 +312,7 @@ def test_load_seq2seq_model_unknown_provider(self): with self.assertRaises(ValueError): ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="FooExecutionProvider") + @require_diffusers def test_load_stable_diffusion_model_from_hub(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) @@ -308,6 +321,7 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_load_stable_diffusion_model_cuda_provider(self): @@ -321,6 +335,7 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -335,6 +350,7 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CPUExecutionProvider" @@ -346,6 +362,7 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) + @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): with self.assertRaises(ValueError): 
ORTStableDiffusionPipeline.from_pretrained( @@ -478,6 +495,7 @@ def test_passing_session_options_seq2seq(self): self.assertEqual(model.encoder.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.decoder.session.get_session_options().intra_op_num_threads, 3) + @require_diffusers def test_passing_session_options_stable_diffusion(self): options = onnxruntime.SessionOptions() options.intra_op_num_threads = 3 @@ -772,6 +790,7 @@ def test_seq2seq_model_on_rocm_ep_str(self): self.assertEqual(model.decoder_with_past.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_passing_provider_options_stable_diffusion(self): @@ -810,6 +829,7 @@ def test_passing_provider_options_stable_diffusion(self): model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "0" ) + @require_diffusers def test_stable_diffusion_model_on_cpu(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") @@ -825,7 +845,7 @@ def test_stable_diffusion_model_on_cpu(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) - # test string device input for to() + @require_diffusers def test_stable_diffusion_model_on_cpu_str(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") @@ -841,6 +861,7 @@ def test_stable_diffusion_model_on_cpu_str(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu(self): @@ -858,6 +879,7 @@ def test_stable_diffusion_model_on_gpu(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -876,6 +898,7 @@ def test_stable_diffusion_model_on_rocm_ep(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") def test_stable_diffusion_model_on_gpu_id(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) @@ -899,7 +922,7 @@ def test_stable_diffusion_model_on_gpu_id(self): self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") - # test string device input for to() + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu_str(self): @@ -916,6 +939,7 @@ def test_stable_diffusion_model_on_gpu_str(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -975,6 
+999,7 @@ def test_save_seq2seq_model_without_past(self): self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) self.assertTrue(CONFIG_NAME in folder_contents) + @require_diffusers def test_save_stable_diffusion_model(self): with tempfile.TemporaryDirectory() as tmpdirname: model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) @@ -1050,6 +1075,7 @@ def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") remove_directory(tmpdirname) + @require_diffusers def test_save_load_stable_diffusion_model_with_external_data(self): with tempfile.TemporaryDirectory() as tmpdirname: os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1" # force exporting small model with external data @@ -1180,6 +1206,7 @@ def test_push_seq2seq_model_with_external_data_to_hub(self): ) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") + @require_diffusers @require_hf_token def test_push_stable_diffusion_model_with_external_data_to_hub(self): with tempfile.TemporaryDirectory() as tmpdirname: diff --git a/tests/onnxruntime/test_stable_diffusion_pipeline.py b/tests/onnxruntime/test_stable_diffusion_pipeline.py deleted file mode 100644 index 44cd22ffecc..00000000000 --- a/tests/onnxruntime/test_stable_diffusion_pipeline.py +++ /dev/null @@ -1,562 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import random -import unittest -from typing import Dict - -import numpy as np -import PIL -import pytest -import torch -from diffusers import ( - OnnxStableDiffusionImg2ImgPipeline, - StableDiffusionPipeline, - StableDiffusionXLPipeline, -) -from diffusers.utils import load_image -from diffusers.utils.testing_utils import floats_tensor -from packaging.version import Version, parse -from parameterized import parameterized -from transformers.testing_utils import require_torch_gpu -from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin - -from optimum.onnxruntime import ( - ORTLatentConsistencyModelPipeline, - ORTStableDiffusionImg2ImgPipeline, - ORTStableDiffusionInpaintPipeline, - ORTStableDiffusionPipeline, - ORTStableDiffusionXLImg2ImgPipeline, - ORTStableDiffusionXLPipeline, -) -from optimum.onnxruntime.modeling_diffusion import ( - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, -) -from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor -from optimum.utils.import_utils import _diffusers_version -from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm - - -if parse(_diffusers_version) > Version("0.21.4"): - from diffusers import LatentConsistencyModelPipeline - - -def _generate_inputs(batch_size=1): - inputs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - -def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"): - if input_type == "pil": - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) - elif input_type == "np": - image = np.random.rand(height, width, channel) - elif input_type == "pt": - image = torch.rand((channel, height, width)) - - return [image] * batch_size - - -def to_np(image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - -class ORTStableDiffusionPipelineBase(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionPipeline - TASK = "text-to-image" - - @require_diffusers - def test_load_vanilla_model_which_is_not_supported(self): - with self.assertRaises(Exception) as context: - _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) - - self.assertIn( - f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_num_images_per_prompt(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - 
@parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_callback(self, model_arch: str): - def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: - callback_fn.has_been_called = True - callback_fn.number_of_steps += 1 - - pipe = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - callback_fn.has_been_called = False - callback_fn.number_of_steps = 0 - inputs = self.generate_inputs(height=64, width=64) - pipe(**inputs, callback=callback_fn, callback_steps=1) - self.assertTrue(callback_fn.has_been_called) - self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"]) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_shape(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - height, width, batch_size = 128, 64, 1 - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - - if self.TASK == "image-to-image": - input_types = ["np", "pil", "pt"] - elif self.TASK == "text-to-image": - input_types = ["np"] - else: - input_types = ["pil"] - - for input_type in input_types: - if self.TASK == "image-to-image": - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - else: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for output_type in ["np", "pil", "latent"]: - inputs["output_type"] = output_type - outputs = pipeline(**inputs).images - if output_type == "pil": - self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) - elif output_type == "np": - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - else: - self.assertEqual( - outputs.shape, - (batch_size, 4, 
height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), - ) - - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = _generate_inputs(batch_size=batch_size) - inputs["height"] = height - inputs["width"] = width - return inputs - - -class ORTStableDiffusionImg2ImgPipelineTest(ORTStableDiffusionPipelineBase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionImg2ImgPipeline - TASK = "image-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_diffusers_pipeline(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - height, width = 128, 128 - - inputs = self.generate_inputs(height=height, width=width) - inputs["prompt"] = "A painting of a squirrel eating a burger" - inputs["image"] = floats_tensor((1, 3, height, width), rng=random.Random(SEED)) - - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - - diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - - self.assertTrue(np.allclose(ort_output, diffusers_onnx_output, atol=1e-1)) - - def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size=batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) - inputs["strength"] = 0.75 - return inputs - - -class ORTStableDiffusionPipelineTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - pipeline.safety_checker = None - batch_size, num_images_per_prompt, height, width = 1, 2, 64, 32 - - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": "sailing ship in storm by Leonardo da Vinci", - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - self.assertIsInstance(ort_outputs, np.ndarray) - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) - - 
@parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_image_reproducibility(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - height, width = 64, 32 - np.random.seed(0) - ort_outputs_1 = pipeline(**inputs, height=height, width=width) - np.random.seed(0) - ort_outputs_2 = pipeline(**inputs, height=height, width=width) - ort_outputs_3 = pipeline(**inputs, height=height, width=width) - # Compare model outputs - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) - self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_negative_prompt(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - inputs["height"], inputs["width"] = 64, 32 - negative_prompt = ["This is a negative prompt"] - np.random.seed(0) - image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1] - prompt = inputs.pop("prompt") - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = pipeline.tokenizer( - p, - padding="max_length", - max_length=pipeline.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32)) - embeds.append(pipeline.text_encoder(text_inputs)[0]) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - np.random.seed(0) - image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1] - self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4)) - - -class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion-xl", - ] - ORTMODEL_CLASS = ORTStableDiffusionXLPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ort_outputs, np.ndarray) - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # 
Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_image_reproducibility(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - height, width = 64, 32 - np.random.seed(0) - ort_outputs_1 = pipeline(**inputs, height=height, width=width) - np.random.seed(0) - ort_outputs_2 = pipeline(**inputs, height=height, width=width) - ort_outputs_3 = pipeline(**inputs, height=height, width=width) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) - self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - -class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionInpaintPipeline - TASK = "inpainting" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_diffusers_pipeline(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch]) - height, width = 64, 64 - latents_shape = ( - 1, - ort_pipeline.vae_decoder.config["latent_channels"], - height // ort_pipeline.vae_scale_factor, - width // ort_pipeline.vae_scale_factor, - ) - inputs = self.generate_inputs(height=height, width=width) - - np_latents = np.random.rand(*latents_shape).astype(np.float32) - torch_latents = torch.from_numpy(np_latents) - - ort_outputs = ort_pipeline(**inputs, latents=np_latents).images - self.assertEqual(ort_outputs.shape, (1, height, width, 3)) - - diffusers_outputs = diffusers_pipeline(**inputs, latents=torch_latents).images - self.assertEqual(diffusers_outputs.shape, (1, height, width, 3)) - - self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4)) - - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width) - inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - return inputs - - -class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion-xl", - ] - ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline - TASK = "image-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_inference(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - - height, width = 128, 128 - inputs = self.generate_inputs(height=height, width=width) - inputs["image"] = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) - output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1] - expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080]) - - self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1)) - - def 
generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size=batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) - inputs["strength"] = 0.75 - return inputs - - -class ImageProcessorTest(unittest.TestCase): - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pt = torch.stack(_create_image(height=8, width=8, batch_size=1, input_type="pt")) - input_np = to_np(input_pt) - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_np = np.stack(_create_image(height=8, width=8, input_type="np")) - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pil = _create_image(height=8, width=8, batch_size=1, input_type="pil") - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - -class ORTLatentConsistencyModelPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "latent-consistency", - ] - ORTMODEL_CLASS = ORTLatentConsistencyModelPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - @unittest.skipIf( - parse(_diffusers_version) <= Version("0.21.4"), - "not supported with this diffusers version, needs diffusers>=v0.22.0", - ) - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_scale": 8.5, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ort_outputs, np.ndarray) - with torch.no_grad(): - outputs = 
pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) From 2179d33a5a9539f065f3c80ca548a05859481688 Mon Sep 17 00:00:00 2001 From: yuanwu2017 Date: Wed, 18 Sep 2024 16:30:52 +0800 Subject: [PATCH 03/50] Disable the exllama on all non-cuda devices. (#2003) * Disable the exllama on all non-cuda devices. 1. Disable the exllama on all non-cuda devices. 2. Don't raise the error when running on non-cuda device. Signed-off-by: yuanwu * Refine the code Signed-off-by: yuanwu * Fix errors of make style Signed-off-by: yuanwu * Add hpu device Signed-off-by: yuanwu * Update optimum/gptq/constants.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Fix error of make style Signed-off-by: yuanwu --------- Signed-off-by: yuanwu Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/gptq/quantizer.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 902af87bbb0..949d4d260df 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -546,7 +546,7 @@ def tmp(_, input, output): if self.bits == 4: # device not on gpu - if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): + if device.type != "cuda" or (has_device_map and any(d in devices for d in ["cpu", "disk", "hpu"])): if not self.disable_exllama: logger.warning( "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`" @@ -589,13 +589,14 @@ def post_init_model(self, model): The input model """ if self.bits == 4 and not self.disable_exllama: - if get_device(model) == torch.device("cpu") or ( - hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) + if get_device(model).type != "cuda" or ( + hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk", "hpu"]) ): - raise ValueError( - "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" - ) + if not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. 
Setting `disable_exllama=True`" + ) + self.disable_exllama = True class StoreAttr(object): pass From bf1befdf7076c12a904eddfef167bfeb3e4fa0f2 Mon Sep 17 00:00:00 2001 From: Longjie Zheng <32992656+zhenglongjiepheonix@users.noreply.github.com> Date: Wed, 18 Sep 2024 08:10:23 -0400 Subject: [PATCH 04/50] Add Parallel Cross Entropy (#2017) --- optimum/fx/parallelization/decomp.py | 2 +- .../op_registry/op_handlers.py | 35 +++- .../parallel_layers/__init__.py | 1 + .../parallelization/parallel_layers/loss.py | 163 ++++++++++++++++++ optimum/fx/parallelization/passes.py | 45 ++++- optimum/fx/parallelization/utils.py | 34 ++++ .../parallelization/test_tensor_parallel.py | 20 +-- 7 files changed, 280 insertions(+), 20 deletions(-) create mode 100644 optimum/fx/parallelization/parallel_layers/loss.py diff --git a/optimum/fx/parallelization/decomp.py b/optimum/fx/parallelization/decomp.py index 26258d451bf..5410818e929 100644 --- a/optimum/fx/parallelization/decomp.py +++ b/optimum/fx/parallelization/decomp.py @@ -197,7 +197,7 @@ def run(self, *args, **kwargs): def decompose_and_functionalize( graph_module: GraphModule, decomposition_table: Dict[torch._ops.OperatorBase, Callable] = core_aten_decompositions(), - leaf_function_targets: List[Callable] = [F.scaled_dot_product_attention], + leaf_function_targets: List[Callable] = [F.scaled_dot_product_attention, F.cross_entropy], ) -> Callable: """ API to decompose and functionalize a high-level graph module. diff --git a/optimum/fx/parallelization/op_registry/op_handlers.py b/optimum/fx/parallelization/op_registry/op_handlers.py index 56b8fc16bc0..4a9c55e3764 100644 --- a/optimum/fx/parallelization/op_registry/op_handlers.py +++ b/optimum/fx/parallelization/op_registry/op_handlers.py @@ -19,7 +19,7 @@ from torch.fx import Node from ..core import Config -from ..utils import is_activation, is_embedding, is_linear +from ..utils import is_activation, is_cross_entropy, is_cross_entropy_parallel_compatible, is_embedding, is_linear class Registry: @@ -334,7 +334,16 @@ def propagate(self) -> List[int]: ndim = arg.meta["val"].ndim slice_dim = (slice_dim + ndim) % ndim if slice_dim == axis: - # slice on the parallel axis is not allowed + # slice on the parallel axis is not allowed, except it's a nop + start, stop, step = 0, arg.meta["val"].shape[axis], 1 + if len(self.node.args) > 2: + start = self.node.args[2] + elif len(self.node.args) > 3: + stop = self.node.args[3] + elif len(self.node.args) > 4: + step = self.node.args[4] + if start == 0 and stop >= arg.meta["val"].shape[axis] and step == 1: + return [axis] return [] return [axis] @@ -404,12 +413,12 @@ def propagate(self) -> List[int]: if self.node.op in ["placeholder", "get_attr"]: return [None] elif self.node.op == "output": - for node in self.node.all_input_nodes: - # TODO: allow parallelized nodes in output, and append comm ops in graph tp all-gather - # parallelized output if intructed - if self.extract_axis(node) is not None: - return [] - return [None] + # does not care about if output is being parallelized right now, because if the output is loss, + # then it must be not parallelized as long as it comes from sharded cross entropy. + # TODO: append all-gather comm ops before all parallelized output nodes if instructed. 
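For intuition about why a loss coming out of sharded cross entropy needs no parallel axis: splitting the logits along the vocab dimension and recombining per-shard maxima and exp-sums reproduces the unsharded loss exactly, so every rank already holds the same replicated value. A minimal single-process sketch of that identity in plain PyTorch (shapes are illustrative; this is not the optimum API):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits, target = torch.randn(4, 10), torch.randint(0, 10, (4,))
shards = logits.split(5, dim=-1)  # two simulated ranks, each holding half the vocab

# all-reduce MAX over ranks, then all-reduce SUM of the shifted exp-sums
global_max = torch.stack([s.max(dim=-1).values for s in shards]).max(dim=0).values
sum_exp = sum((s - global_max[:, None]).exp().sum(dim=-1) for s in shards)
# only the rank owning the target index contributes the predicted logit
predicted = logits.gather(-1, target[:, None]).squeeze(-1) - global_max

loss = (sum_exp.log() - predicted).mean()
assert torch.allclose(loss, F.cross_entropy(logits, target), atol=1e-6)
```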
+ input_arg = self.node.all_input_nodes[0] + axis = self.extract_axis(input_arg) + return [axis] elif is_linear(self.node): input_arg = self.node.all_input_nodes[0] axis = self.extract_axis(input_arg) @@ -438,6 +447,16 @@ def propagate(self) -> List[int]: return [1, None] if self.config.enable_sequence_parallel else [None] else: return [] + elif is_cross_entropy(self.node): + logits = self.node.all_input_nodes[0] + axis = self.extract_axis(logits) + if axis is None or ( + is_cross_entropy_parallel_compatible(self.node) and axis == logits.meta["val"].ndim - 1 + ): + # for cross entropy, the input logits parallel axis can only be the last axis or None + return [None] + else: + return [] elif is_activation(self.node): return UnaryOpParallelAxisPropagateHandler(self.node, self.meta_key, self.config).propagate() diff --git a/optimum/fx/parallelization/parallel_layers/__init__.py b/optimum/fx/parallelization/parallel_layers/__init__.py index 9bfb13afdf6..474ae7f7eef 100644 --- a/optimum/fx/parallelization/parallel_layers/__init__.py +++ b/optimum/fx/parallelization/parallel_layers/__init__.py @@ -14,3 +14,4 @@ # limitations under the License. from .embedding import VocabParallelEmbedding from .linear import ColumnParallelLinear, RowParallelLinear +from .loss import VocabParallelCrossEntropyLoss, sharded_cross_entropy_wrapper_fn diff --git a/optimum/fx/parallelization/parallel_layers/loss.py b/optimum/fx/parallelization/parallel_layers/loss.py new file mode 100644 index 00000000000..0a11e33c08e --- /dev/null +++ b/optimum/fx/parallelization/parallel_layers/loss.py @@ -0,0 +1,163 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import wraps +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn + +from ..core import ParallelExecutionCtx + + +# Adapted from https://github.com/huggingface/nanotron/blob/main/src/nanotron/parallel/tensor_parallel/functional.py +class _ShardedCrossEntropy(torch.autograd.Function): + @staticmethod + def forward( + ctx, + sharded_logits: torch.Tensor, # (batch_size, length, sharded_hidden_size) + target: torch.Tensor, # (batch_size, length) + group: dist.ProcessGroup, + ): + # Maximum value along last dimension across all GPUs. + logits_max = torch.max(sharded_logits, dim=-1)[0] + dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=group) + # Subtract the maximum value. + sharded_logits = sharded_logits - logits_max.unsqueeze(dim=-1) + + # Get the shard's indices + sharded_hidden_size = sharded_logits.shape[-1] + rank = dist.get_rank(group) + start_index = rank * sharded_hidden_size + end_index = start_index + sharded_hidden_size + + # Create a mask of valid ids (1 means it needs to be masked). + target_mask = (target < start_index) | (target >= end_index) + masked_target = target.clone() - start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. 
+ # For Simplicity, we convert logits to a 2-D tensor with size + # [*, shard-size] and target to a 1-D tensor of size [*]. + logits_2d = sharded_logits.view(-1, sharded_hidden_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange(start=0, end=logits_2d.shape[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + if predicted_logits_1d.is_contiguous(): + predicted_logits_1d = predicted_logits_1d.clone() + else: + predicted_logits_1d = predicted_logits_1d.contiguous() + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + # All reduce is needed to get the chunks from other GPUs. + dist.all_reduce(predicted_logits, op=dist.ReduceOp.SUM, group=group) + + # Sum of exponential of logits along vocab dimension across all GPUs. + exp_logits = sharded_logits + torch.exp(sharded_logits, out=exp_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + dist.all_reduce(sum_exp_logits, op=dist.ReduceOp.SUM, group=group) + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Normalize and optionally smooth logits + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + + # Store softmax, target-mask and masked-target for backward pass. + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + # Retrieve tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + # All the inputs have softmax as their gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + sharded_hidden_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, sharded_hidden_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float() + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input, None, None + + +def sharded_cross_entropy(sharded_logits: torch.Tensor, target: torch.Tensor, process_group: dist.ProcessGroup): + return _ShardedCrossEntropy.apply(sharded_logits, target, process_group) + + +def sharded_cross_entropy_wrapper_fn(process_group: dist.ProcessGroup): + @wraps(sharded_cross_entropy) + def wrapper( + sharded_logits: torch.Tensor, + target: torch.Tensor, + weight: Optional[torch.Tensor] = None, + size_average: Optional[bool] = None, + ignore_index: int = -100, + reduce: Optional[bool] = None, + reduction: str = "mean", + label_smoothing: float = 0.0, + ): + if weight is not None or ignore_index != -100 or label_smoothing != 0.0: + raise ValueError( + "Does not support weighted mode, index ignoring and label smoothing in current parallel cross entropy implementation." 
+ ) + loss: torch.Tensor = sharded_cross_entropy(sharded_logits, target, process_group) + + if size_average is not None or reduce is not None: + size_average = True if size_average is None else size_average + reduce = True if reduce is None else reduce + + if size_average and reduce: + reduction = "mean" + elif reduce: + reduction = "sum" + else: + reduction = "none" + + if reduction == "mean": + return loss.mean() + elif reduction == "sum": + return loss.sum() + return loss + + return wrapper + + +class VocabParallelCrossEntropyLoss(nn.Module): + """ + Simple parallel cross entropy implementation which does not support weighted mode and label smoothing yet. + """ + + def __init__(self, ctx: ParallelExecutionCtx, reduction: str = "mean") -> None: + super(VocabParallelCrossEntropyLoss, self).__init__() + self.process_group = ctx.tp_group + self.reduction = reduction + + def forward(self, sharded_logits: torch.Tensor, target: torch.Tensor): + loss: torch.Tensor = _ShardedCrossEntropy.apply(sharded_logits, target, self.process_group) + if self.reduction == "mean": + return loss.mean() + elif self.reduction == "sum": + return loss.sum() + return loss diff --git a/optimum/fx/parallelization/passes.py b/optimum/fx/parallelization/passes.py index 14b652fff73..90155263281 100644 --- a/optimum/fx/parallelization/passes.py +++ b/optimum/fx/parallelization/passes.py @@ -26,8 +26,15 @@ from .decomp import decompose_and_functionalize from .distributed import scatter from .op_registry import REGISTRY, FallbackParallelAxisPropagateHandler -from .parallel_layers import ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding +from .parallel_layers import ( + ColumnParallelLinear, + RowParallelLinear, + VocabParallelCrossEntropyLoss, + VocabParallelEmbedding, + sharded_cross_entropy_wrapper_fn, +) from .utils import ( + is_cross_entropy, is_embedding, is_linear, is_shape_consumer, @@ -273,6 +280,11 @@ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Conf info["sequence_parallel"] = False self.place_marker_per_node(node, info) + elif is_cross_entropy(node): + axis_before = ParallelAxisSolverPass.get_stored_field_info(node.args[0], "parallel_axis") + if axis_before is not None: + self.place_marker_per_node(node, {"axis": "vocab"}) + return graph_module @@ -343,6 +355,35 @@ def handle_embedding(node: Node, ctx: ParallelExecutionCtx) -> None: layer_cache[key] = new_mod setattr(parent_mod, field, new_mod) + @staticmethod + def handle_cross_entropy(node: Node, ctx: ParallelExecutionCtx) -> None: + axis = ParallelLayerAnnotatePass.get_stored_field_info(node, field="axis") + if axis is None: + return + + assert axis in {"vocab"}, "Only support parallelization on vocab dim for now." 
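A quick smoke test for the sharded loss wired in by this pass: with a single rank the lone shard holds the entire vocab, so the result must match `F.cross_entropy` directly. A hedged sketch (gloo backend; the env values are illustrative and the import path is assumed from the diffstat above):

```python
import os

import torch
import torch.distributed as dist
import torch.nn.functional as F

from optimum.fx.parallelization.parallel_layers.loss import sharded_cross_entropy

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

logits = torch.randn(2, 8, 32)  # (batch, length, vocab)
target = torch.randint(0, 32, (2, 8))

# per-token losses, reduced manually like the wrapper's reduction="mean" path
loss = sharded_cross_entropy(logits, target, dist.group.WORLD).mean()
reference = F.cross_entropy(logits.view(-1, 32), target.view(-1))
assert torch.allclose(loss, reference, atol=1e-6)

dist.destroy_process_group()
```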
+ if node.op == "call_module": + graph_module = node.graph.owning_module + prefix_and_field = node.target.rsplit(".", maxsplit=1) + if len(prefix_and_field) == 2: + parent_mod = graph_module.get_submodule(prefix_and_field[0]) + field = prefix_and_field[1] + else: + parent_mod = graph_module + field = node.target + + mod: nn.CrossEntropyLoss = graph_module.get_submodule(node.target) + key, layer_cache = node.target, ctx.parallel_layer_cache + if key in layer_cache: + new_mod = layer_cache[key] + else: + assert ctx.compile_times == 0, "illegal path for recompilation" + new_mod = VocabParallelCrossEntropyLoss(ctx, reduction=mod.reduction) + layer_cache[key] = new_mod + setattr(parent_mod, field, new_mod) + else: + node.target = sharded_cross_entropy_wrapper_fn(process_group=ctx.tp_group) + @staticmethod def handle_hard_coded_axis_param(node: Node, ctx: ParallelExecutionCtx) -> None: def extract_shape_from_node(node: Node) -> List[Any]: @@ -384,6 +425,8 @@ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Conf self.handle_linear(node, ctx) elif is_embedding(node): self.handle_embedding(node, ctx) + elif is_cross_entropy(node): + self.handle_cross_entropy(node, ctx) # correct the attention head num in parallel setting elif is_shape_consumer(node): self.handle_hard_coded_axis_param(node, ctx) diff --git a/optimum/fx/parallelization/utils.py b/optimum/fx/parallelization/utils.py index b7b1ccd41c8..3074638737f 100644 --- a/optimum/fx/parallelization/utils.py +++ b/optimum/fx/parallelization/utils.py @@ -82,6 +82,40 @@ def is_shape_generator(node: Node) -> bool: return node.op == "call_method" and node.target == "size" +def is_cross_entropy(node: Node) -> bool: + if node.op == "call_function": + return node.target is F.cross_entropy + elif node.op == "call_module": + mod = node.graph.owning_module + return isinstance(mod.get_submodule(node.target), nn.CrossEntropyLoss) + return False + + +def is_cross_entropy_parallel_compatible(node: Node) -> bool: + """ + For now `VocabParallelCrossEntropyLoss` does not support weighted mode, index ignoring and label smoothing. 
+ """ + if node.op == "call_function": + weight = node.kwargs.get("weight", None) + ignore_index = node.kwargs.get("ignore_index", -100) + label_smoothing = node.kwargs.get("label_smoothing", 0.0) + if len(node.args) > 2 and weight is None: + weight = node.args[2] + if len(node.args) > 4 and ignore_index == -100: + ignore_index = node.args[4] + if len(node.args) > 7 and label_smoothing == 0.0: + label_smoothing = node.args[7] + + return weight is None and ignore_index == -100 and label_smoothing == 0.0 + + elif node.op == "call_module": + mod: nn.CrossEntropyLoss = node.graph.owning_module.get_submodule(node.target) + weight, label_smoothing, ignore_index = mod.weight, mod.label_smoothing, mod.ignore_index + return weight is None and ignore_index == -100 and label_smoothing == 0.0 + + return False + + def stable_topological_sort(graph: Graph): def _args(n: torch.fx.Node) -> List[torch.fx.node.Argument]: args: List[torch.fx.node.Argument] = [] diff --git a/tests/fx/parallelization/test_tensor_parallel.py b/tests/fx/parallelization/test_tensor_parallel.py index 9626fccec3b..8a00393c4d7 100644 --- a/tests/fx/parallelization/test_tensor_parallel.py +++ b/tests/fx/parallelization/test_tensor_parallel.py @@ -36,6 +36,7 @@ "output_attentions": False, "output_hidden_states": False, "tie_word_embeddings": True, + "return_dict": True, } DUMMY_MODELS_TO_TEST = ( @@ -64,11 +65,10 @@ def prepare_dummy_inputs( seq_len: int = 10, device: Union[str, torch.device] = "cuda", ): - return { - "input_ids": torch.randint(low=1, high=model_config.vocab_size, size=(batch_size, seq_len), device=device), - "attention_mask": torch.ones((batch_size, seq_len), dtype=torch.int64, device=device), - "position_ids": torch.arange(0, seq_len, device=device).unsqueeze(0).expand(batch_size, -1), - } + input_ids = torch.randint(low=1, high=model_config.vocab_size, size=(batch_size, seq_len), device=device) + attention_mask = torch.ones((batch_size, seq_len), dtype=torch.int64, device=device) + labels = input_ids.clone() + return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} def run_test_all_rank_results_match(rank: int, world_size: int, model_id: str, model_kwargs: Dict[str, Any]): @@ -82,8 +82,8 @@ def run_test_all_rank_results_match(rank: int, world_size: int, model_id: str, m model = parallelize_model(model_id, ctx, skip_load_weights=True, **model_kwargs) inputs = prepare_dummy_inputs(model.config) - logits = model(**inputs)[0] - tensors = gather_at_main_process(tensor=logits, group=tp_group, rank=rank, world_size=world_size) + loss = model(**inputs).loss + tensors = gather_at_main_process(tensor=loss, group=tp_group, rank=rank, world_size=world_size) # check results at main worker process if rank == 0: @@ -145,7 +145,7 @@ def run_test_parallel_results_matches_non_parallel( inputs = prepare_dummy_inputs(model.config) set_seed(SEED) - logits = model(**inputs)[0] + loss = model(**inputs).loss torch._dynamo.reset() del model @@ -154,9 +154,9 @@ def run_test_parallel_results_matches_non_parallel( set_seed(SEED) ctx = ParallelExecutionCtx(tp_group=tp_group, current_device=device) model = parallelize_model(model_id, ctx, skip_load_weights=True, **model_kwargs) - parallel_logits = model(**inputs)[0] + parallel_loss = model(**inputs).loss - torch.testing.assert_close(logits.cpu(), parallel_logits.cpu(), rtol=1e-4, atol=1e-4) + torch.testing.assert_close(loss.cpu(), parallel_loss.cpu(), rtol=1e-4, atol=1e-4) dist.barrier(tp_group) tearDown() From 2fb5ea5ca7ca8ea887af2851cce80ab2545d3f4f Mon Sep 17 
00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 18 Sep 2024 22:48:34 +0200 Subject: [PATCH 05/50] Fix `is_torch_tpu_available` in ORT Trainer (#2028) --- optimum/onnxruntime/trainer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 86c333adb3f..66273cbcf96 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -103,14 +103,14 @@ from transformers.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_zero3_enabled if check_if_transformers_greater("4.39"): - from transformers.utils import is_torch_xla_available + from transformers.utils import is_torch_xla_available as is_torch_tpu_xla_available - if is_torch_xla_available(): + if is_torch_tpu_xla_available(): import torch_xla.core.xla_model as xm else: - from transformers.utils import is_torch_tpu_available + from transformers.utils import is_torch_tpu_available as is_torch_tpu_xla_available - if is_torch_tpu_available(check_device=False): + if is_torch_tpu_xla_available(check_device=False): import torch_xla.core.xla_model as xm if TYPE_CHECKING: @@ -735,7 +735,7 @@ def get_dataloader_sampler(dataloader): if ( args.logging_nan_inf_filter - and not is_torch_tpu_available() + and not is_torch_tpu_xla_available() and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): # if loss is nan or inf simply add the average of previous logged losses From fd638d20046a73a7221083b23c69b98445e2d321 Mon Sep 17 00:00:00 2001 From: Vijay Date: Thu, 26 Sep 2024 12:59:19 +0530 Subject: [PATCH 06/50] Added image-to-image task for ORT Pipeline (#2031) * Add ORTModelForImageToImage for image-to-image task Swin2SR * Added image-to-image task to optimum pipeline * Add Tests for ORTModelForImageToImage for image-to-image task Swin2SR * Use export=True for models from transformers, self._setup and more * Code Refactor * Refactor ORTModelForImageToImageIntegrationTest --- optimum/onnxruntime/__init__.py | 2 + optimum/onnxruntime/modeling_ort.py | 73 ++++++++++ optimum/pipelines/pipelines_base.py | 8 ++ tests/onnxruntime/test_modeling.py | 136 ++++++++++++++++++- tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 5 files changed, 219 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 09a48ec955c..1cb5b7c47b9 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -44,6 +44,7 @@ "ORTModelForSemanticSegmentation", "ORTModelForSequenceClassification", "ORTModelForTokenClassification", + "ORTModelForImageToImage", ], "modeling_seq2seq": [ "ORTModelForSeq2SeqLM", @@ -112,6 +113,7 @@ ORTModelForCustomTasks, ORTModelForFeatureExtraction, ORTModelForImageClassification, + ORTModelForImageToImage, ORTModelForMaskedLM, ORTModelForMultipleChoice, ORTModelForQuestionAnswering, diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 254b771e334..9166f7c2cbe 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -34,6 +34,7 @@ AutoModelForAudioXVector, AutoModelForCTC, AutoModelForImageClassification, + AutoModelForImageToImage, AutoModelForMaskedLM, AutoModelForMultipleChoice, AutoModelForQuestionAnswering, @@ -47,6 +48,7 @@ BaseModelOutput, CausalLMOutput, ImageClassifierOutput, + ImageSuperResolutionOutput, MaskedLMOutput, ModelOutput, MultipleChoiceModelOutput, @@ -2183,6 +2185,77 @@ def forward( return 
TokenClassifierOutput(logits=logits) +IMAGE_TO_IMAGE_EXAMPLE = r""" + Example of image-to-image (Super Resolution): + + ```python + >>> import torch + >>> from transformers import {processor_class} + >>> from optimum.onnxruntime import {model_class} + >>> from PIL import Image + + >>> image = Image.open("path/to/image.jpg") + + >>> image_processor = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... reconstruction = model(**inputs).reconstruction + ``` +""" + + +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTModelForImageToImage(ORTModel): + """ + ONNX Model for image-to-image tasks. This class officially supports swin2sr. + """ + + auto_model_class = AutoModelForImageToImage + + @add_start_docstrings_to_model_forward( + ONNX_IMAGE_INPUTS_DOCSTRING.format("batch_size, num_channels, height, width") + + IMAGE_TO_IMAGE_EXAMPLE.format( + processor_class=_PROCESSOR_FOR_DOC, + model_class="ORTModelForImageToImage", + checkpoint="caidas/swin2SR-realworld-sr-x4-64-bsrgan-psnr", + ) + ) + def forward( + self, + pixel_values: Union[torch.Tensor, np.ndarray], + **kwargs, + ): + use_torch = isinstance(pixel_values, torch.Tensor) + self.raise_on_numpy_input_io_binding(use_torch) + if self.device.type == "cuda" and self.use_io_binding: + input_shapes = pixel_values.shape + io_binding, output_shapes, output_buffers = self.prepare_io_binding( + pixel_values, + ordered_input_names=self._ordered_input_names, + known_output_shapes={ + "reconstruction": [ + input_shapes[0], + input_shapes[1], + input_shapes[2] * self.config.upscale, + input_shapes[3] * self.config.upscale, + ] + }, + ) + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() + reconstruction = output_buffers["reconstruction"].view(output_shapes["reconstruction"]) + else: + model_inputs = {"pixel_values": pixel_values} + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + reconstruction = model_outputs["reconstruction"] + return ImageSuperResolutionOutput(reconstruction=reconstruction) + + CUSTOM_TASKS_EXAMPLE = r""" Example of custom tasks(e.g. 
a sentence transformers taking `pooler_output` as output): diff --git a/optimum/pipelines/pipelines_base.py b/optimum/pipelines/pipelines_base.py index a08ab8782a3..7690143f13f 100644 --- a/optimum/pipelines/pipelines_base.py +++ b/optimum/pipelines/pipelines_base.py @@ -24,6 +24,7 @@ FillMaskPipeline, ImageClassificationPipeline, ImageSegmentationPipeline, + ImageToImagePipeline, ImageToTextPipeline, Pipeline, PreTrainedTokenizer, @@ -55,6 +56,7 @@ ORTModelForCausalLM, ORTModelForFeatureExtraction, ORTModelForImageClassification, + ORTModelForImageToImage, ORTModelForMaskedLM, ORTModelForQuestionAnswering, ORTModelForSemanticSegmentation, @@ -157,6 +159,12 @@ "default": "superb/hubert-base-superb-ks", "type": "audio", }, + "image-to-image": { + "impl": ImageToImagePipeline, + "class": (ORTModelForImageToImage,), + "default": "caidas/swin2SR-classical-sr-x2-64", + "type": "image", + }, } else: ORT_SUPPORTED_TASKS = {} diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 199b96342e7..f6771ce7618 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -42,6 +42,7 @@ AutoModelForCausalLM, AutoModelForCTC, AutoModelForImageClassification, + AutoModelForImageToImage, AutoModelForMaskedLM, AutoModelForMultipleChoice, AutoModelForQuestionAnswering, @@ -57,7 +58,9 @@ PretrainedConfig, set_seed, ) +from transformers.modeling_outputs import ImageSuperResolutionOutput from transformers.modeling_utils import no_init_weights +from transformers.models.swin2sr.configuration_swin2sr import Swin2SRConfig from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import get_gpu_count, require_torch_gpu, slow from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin @@ -79,6 +82,7 @@ ORTModelForCustomTasks, ORTModelForFeatureExtraction, ORTModelForImageClassification, + ORTModelForImageToImage, ORTModelForMaskedLM, ORTModelForMultipleChoice, ORTModelForPix2Struct, @@ -4704,6 +4708,136 @@ def test_compare_generation_to_io_binding( gc.collect() +class ORTModelForImageToImageIntegrationTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["swin2sr"] + + ORTMODEL_CLASS = ORTModelForImageToImage + + TASK = "image-to-image" + + def _get_sample_image(self): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + def _get_preprocessors(self, model_id): + image_processor = AutoImageProcessor.from_pretrained(model_id) + + return image_processor + + def test_load_vanilla_transformers_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = ORTModelForImageToImage.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn("only supports the tasks", str(context.exception)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + self.assertIsInstance(onnx_model.config, Swin2SRConfig) + set_seed(SEED) + + transformers_model = AutoModelForImageToImage.from_pretrained(model_id) + image_processor = self._get_preprocessors(model_id) + + data = self._get_sample_image() + features = image_processor(data, return_tensors="pt") + + with torch.no_grad(): + transformers_outputs = transformers_model(**features) + + onnx_outputs = 
onnx_model(**features) + self.assertIsInstance(onnx_outputs, ImageSuperResolutionOutput) + self.assertTrue("reconstruction" in onnx_outputs) + self.assertIsInstance(onnx_outputs.reconstruction, torch.Tensor) + self.assertTrue(torch.allclose(onnx_outputs.reconstruction, transformers_outputs.reconstruction, atol=1e-4)) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_generate_utils(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + + data = self._get_sample_image() + features = image_processor(data, return_tensors="pt") + + outputs = onnx_model(**features) + self.assertIsInstance(outputs, ImageSuperResolutionOutput) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_image_to_image(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + pipe = pipeline( + "image-to-image", + model=onnx_model, + feature_extractor=image_processor, + ) + data = self._get_sample_image() + outputs = pipe(data) + self.assertEqual(pipe.device, onnx_model.device) + self.assertIsInstance(outputs, Image.Image) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_torch_gpu + @pytest.mark.cuda_ep_test + def test_pipeline_on_gpu(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + pipe = pipeline( + "image-to-image", + model=onnx_model, + feature_extractor=image_processor, + device=0, + ) + + data = self._get_sample_image() + outputs = pipe(data) + + self.assertEqual(pipe.model.device.type.lower(), "cuda") + self.assertIsInstance(outputs, Image.Image) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + pipe = pipeline( + "image-to-image", + model=onnx_model, + feature_extractor=image_processor, + device=0, + ) + + data = self._get_sample_image() + outputs = pipe(data) + + self.assertEqual(pipe.model.device.type.lower(), "cuda") + self.assertIsInstance(outputs, Image.Image) + + class ORTModelForVision2SeqIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = ["vision-encoder-decoder", "trocr", "donut"] @@ -4831,7 +4965,6 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach len(onnx_outputs["past_key_values"][0]), len(transformers_outputs["past_key_values"][0]) ) for i in range(len(onnx_outputs["past_key_values"])): - print(onnx_outputs["past_key_values"][i]) for ort_pkv, trfs_pkv in zip( onnx_outputs["past_key_values"][i], transformers_outputs["past_key_values"][i] ): @@ -5517,6 
+5650,7 @@ class TestBothExportersORTModel(unittest.TestCase): ["automatic-speech-recognition", ORTModelForCTCIntegrationTest], ["audio-xvector", ORTModelForAudioXVectorIntegrationTest], ["audio-frame-classification", ORTModelForAudioFrameClassificationIntegrationTest], + ["image-to-image", ORTModelForImageToImageIntegrationTest], ] ) def test_find_untested_architectures(self, task: str, test_class): diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index bb6935461d7..0790f6329dc 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -144,6 +144,7 @@ "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", + "swin2sr": "hf-internal-testing/tiny-random-Swin2SRForImageSuperResolution", "t5": "hf-internal-testing/tiny-random-t5", "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel", "trocr": "microsoft/trocr-small-handwritten", From f7c3a7fa766f06af63a15e94b162ada56d021b16 Mon Sep 17 00:00:00 2001 From: Guillaume LEGENDRE Date: Fri, 27 Sep 2024 14:12:14 +0200 Subject: [PATCH 07/50] CI - update runner type (#2033) update runner type --- .github/workflows/test_fx_automatic_parallel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_fx_automatic_parallel.yml b/.github/workflows/test_fx_automatic_parallel.yml index d8af6e40caa..05ebf7ea9e5 100644 --- a/.github/workflows/test_fx_automatic_parallel.yml +++ b/.github/workflows/test_fx_automatic_parallel.yml @@ -24,7 +24,7 @@ jobs: config: - name: GPU-enabled Optimum Test Suite image: nvidia/cuda:12.4.1-devel-ubuntu22.04 - gpu_target: ["nvidia-multi-gpu-a10-runners"] + gpu_target: ["aws-g5-12xlarge-plus"] name: ${{ matrix.config.name }} runs-on: From c6b46786ce12b3a9d2e8be2b8f41342ec314f46a Mon Sep 17 00:00:00 2001 From: rbrugaro Date: Mon, 30 Sep 2024 02:50:23 -0700 Subject: [PATCH 08/50] Add ipex to documentation (#2027) * adding ipex reference in optimum docs * minor fix --- docs/source/index.mdx | 2 +- docs/source/installation.mdx | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 7eb79c33ed2..06133664ca8 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -36,7 +36,7 @@ The packages below enable you to get the best of the 🤗 Hugging Face ecosystem
Intel
-

Optimize your model to speedup inference with OpenVINO and Neural Compressor

+

Optimize your model to speed up inference with OpenVINO, Neural Compressor and IPEX

AWS Trainium/Inferentia
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index c08b3f92e5c..27733574c80 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -25,6 +25,7 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can | [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]` | | [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[neural-compressor]` | | [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[openvino]` | +| [IPEX](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[ipex]` | | [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` | | [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade --upgrade-strategy eager optimum[amd]` | | [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade --upgrade-strategy eager optimum[neuronx]` | From 049b00f61c9bb17bd2b20a3b77d04cc4c0f20d86 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 30 Sep 2024 18:51:01 +0200 Subject: [PATCH 09/50] Add Transformers v4.45 support (#2023) * transformers v4.45 support * fix transformers v4.45 compatibility * update opset * update model * Add generation config saving * fix codegen * bump default opset m2m100 * fix codegen * fix bettertransformers * add deprecation warning for bettertransformer * bettertransformers fixes * disable transformers 4.45 for onnx export * update model ID --- Makefile | 4 +- optimum/bettertransformer/models/attention.py | 84 +++++++++++++++++-- .../models/decoder_models.py | 35 +++++++- optimum/bettertransformer/transformation.py | 4 + optimum/exporters/onnx/convert.py | 18 ++++ optimum/exporters/onnx/model_configs.py | 11 +-- optimum/modeling_base.py | 3 + optimum/onnxruntime/modeling_decoder.py | 58 ++++++++----- optimum/onnxruntime/modeling_ort.py | 3 - optimum/onnxruntime/modeling_seq2seq.py | 64 +++++++------- optimum/onnxruntime/optimization.py | 12 ++- setup.py | 9 +- tests/bettertransformer/testing_utils.py | 4 +- tests/onnxruntime/utils_onnxruntime_tests.py | 6 +- 14 files changed, 223 insertions(+), 92 deletions(-) diff --git a/Makefile b/Makefile index e2c21263031..824ef3d0cf3 100644 --- a/Makefile +++ b/Makefile @@ -23,11 +23,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL)) # Run code quality checks style_check: black --check . - ruff . + ruff check . style: black . - ruff . --fix + ruff check . 
--fix # Run tests for the library test: diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 9dfa57844d4..22b8faf1c21 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -92,6 +92,71 @@ def gpt2_wrapped_scaled_dot_product( return sdpa_result, None +# Adapted from transformers.models.gptj.modeling_gptj.GPTJAttention._attn +def gptj_wrapped_scaled_dot_product( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, +): + raise_on_head_mask(head_mask) + batch_size = query.shape[0] + + mask_value = torch.finfo(value.dtype).min + mask_value = torch.full([], mask_value, dtype=value.dtype) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + query = query.to(value.dtype) + key = key.to(value.dtype) + + if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1: + raise ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.") + + dropout_p = self.dropout_prob_attn if self.training else 0.0 + if batch_size == 1 or self.training: + if query.shape[2] > 1: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True + ) + else: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=False + ) + else: + query_length, key_length = query.size(-2), key.size(-2) + + # causal_mask is always [True, ..., True] otherwise, so executing this + # is unnecessary + if query_length > 1: + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + + causal_mask = torch.where(causal_mask, 0, mask_value) + + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + if attention_mask is not None: + attention_mask = causal_mask + attention_mask + + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] + + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False + ) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + sdpa_result = sdpa_result.to(value.dtype) + + return sdpa_result, None + + # Adapted from transformers.models.bark.modeling_bark.BarkSelfAttention._attn def bark_wrapped_scaled_dot_product( self, @@ -195,7 +260,7 @@ def codegen_wrapped_scaled_dot_product( query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True ) else: - # in this case, which is the later decoding steps, the `causal_mask`` in + # in this case, which is the later decoding steps, the `causal_mask` in # https://github.com/huggingface/transformers/blob/ae54e3c3b18bac0832ad62ea9b896dfd52a09850/src/transformers/models/gpt2/modeling_gpt2.py#L195 # is [True, ..., True] so actually not causal sdpa_result = torch.nn.functional.scaled_dot_product_attention( @@ -207,15 +272,20 @@ def codegen_wrapped_scaled_dot_product( # causal_mask is always [True, ..., True] otherwise, so executing this # is unnecessary if query_length > 1: - causal_mask = self.causal_mask[:, :, key_length - 
query_length : key_length, :key_length].to(torch.bool) + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to( + torch.bool + ) - causal_mask = torch.where(causal_mask, 0, mask_value) + causal_mask = torch.where(causal_mask, 0, mask_value) - # torch.Tensor.expand does no memory copy - causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) - # we use torch.min to avoid having tensor(-inf) - attention_mask = torch.min(causal_mask, attention_mask) + # we use torch.min to avoid having tensor(-inf) + attention_mask = torch.min(causal_mask, attention_mask) + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] sdpa_result = torch.nn.functional.scaled_dot_product_attention( query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index b64b7f5a1eb..52d28d076d3 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -44,6 +44,7 @@ codegen_wrapped_scaled_dot_product, gpt2_wrapped_scaled_dot_product, gpt_neo_wrapped_scaled_dot_product, + gptj_wrapped_scaled_dot_product, opt_forward, t5_forward, ) @@ -82,7 +83,7 @@ def forward(self, *args, **kwargs): class GPTJAttentionLayerBetterTransformer(BetterTransformerBaseLayer, GPTJAttention, nn.Module): - _attn = gpt2_wrapped_scaled_dot_product + _attn = gptj_wrapped_scaled_dot_product def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super().__init__(config) @@ -96,14 +97,22 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): "out_proj", "attn_dropout", "resid_dropout", - "bias", "scale_attn", - "masked_bias", ] # Attribute only for transformers>=4.28 if hasattr(layer, "embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "bias"): + submodules.append("bias") + if hasattr(layer, "masked_bias"): + submodules.append("masked_bias") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -127,6 +136,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["rotary_emb", "query_key_value", "dense", "bias", "masked_bias", "norm_factor"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -155,6 +169,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["attn_dropout", "resid_dropout", "k_proj", "v_proj", "q_proj", "out_proj", "bias", "masked_bias"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_id"): + submodules.append("layer_id") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -238,12 +257,20 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super(BetterTransformerBaseLayer, self).__init__(config) self.module_mapping = None - submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "causal_mask", "scale_attn"] + submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "scale_attn"] # 
Attribute only for transformers>=4.28 if hasattr(layer, "embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "causal_mask"): + submodules.append("causal_mask") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py index 2105e199870..a101757b6fa 100644 --- a/optimum/bettertransformer/transformation.py +++ b/optimum/bettertransformer/transformation.py @@ -206,6 +206,10 @@ def transform( The converted model if the conversion has been successful. """ + logger.warning( + "The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release." + ) + hf_config = model.config if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]: raise ValueError( diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 63a9067b90c..f2bf95f3e3c 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -26,6 +26,7 @@ import numpy as np import onnx +import transformers from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available @@ -34,6 +35,7 @@ DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, TORCH_MINIMUM_VERSION, + check_if_transformers_greater, is_diffusers_available, is_torch_onnx_support_available, logging, @@ -999,6 +1001,10 @@ def onnx_export_from_model( >>> onnx_export_from_model(model, output="gpt2_onnx/") ``` """ + if check_if_transformers_greater("4.44.99"): + raise ImportError( + f"ONNX conversion disabled for now for transformers version greater than v4.45, found {transformers.__version__}" + ) TasksManager.standardize_model_attributes(model) @@ -1120,6 +1126,18 @@ def onnx_export_from_model( if isinstance(atol, dict): atol = atol[task.replace("-with-past", "")] + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + if model.can_generate() and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) + # Saving the model config and preprocessor as this is needed sometimes. model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d4b15b2968b..36963a986d0 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -119,7 +119,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class AlbertOnnxConfig(BertOnnxConfig): - DEFAULT_ONNX_OPSET = 11 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
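These opset bumps track PyTorch 2.1.1 making `F.scaled_dot_product_attention` the default attention path: its ONNX export lowers through operators such as Trilu that only exist in ai.onnx opset 14 and above. One way to confirm what an exported model actually declares (a sketch; the path is a placeholder):

```python
import onnx

model = onnx.load("albert_onnx/model.onnx")
for entry in model.opset_import:
    # an ai.onnx version below 14 here would explain failures on the SDPA path
    print(entry.domain or "ai.onnx", entry.version)
```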
class ConvBertOnnxConfig(BertOnnxConfig): @@ -171,11 +171,11 @@ class MPNetOnnxConfig(DistilBertOnnxConfig): class RobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class CamembertOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class FlaubertOnnxConfig(BertOnnxConfig): @@ -187,7 +187,7 @@ class IBertOnnxConfig(DistilBertOnnxConfig): class XLMRobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class DebertaOnnxConfig(BertOnnxConfig): @@ -257,7 +257,7 @@ class ImageGPTOnnxConfig(GPT2OnnxConfig): class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_attention_heads="num_heads") @@ -564,6 +564,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class M2M100OnnxConfig(TextSeq2SeqOnnxConfig): + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( encoder_num_layers="encoder_layers", decoder_num_layers="decoder_layers", diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 3da2d9d0d21..29521b7c0c6 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -371,6 +371,9 @@ def from_pretrained( export = from_transformers if len(model_id.split("@")) == 2: + logger.warning( + f"Specifying the `revision` as @{model_id.split('@')[1]} is deprecated and will be removed in v1.23, please use the `revision` argument instead." + ) if revision is not None: logger.warning( f"The argument `revision` was set to {revision} but will be ignored for {model_id.split('@')[1]}" diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index f6d4b7e20ab..bda3ec98d9a 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -14,7 +14,6 @@ """Classes handling causal-lm related architectures in ONNX Runtime.""" import logging -import warnings from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union @@ -149,6 +148,19 @@ def __init__( generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + self.onnx_paths = [self.model_path] self.use_merged = "use_cache_branch" in self.input_names self.model_type = self.config.model_type @@ -393,7 +405,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -410,15 +421,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -586,6 +589,22 @@ def _from_pretrained( else: init_cls = ORTModelForCausalLM + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) + return init_cls( model=model, config=config, @@ -593,6 +612,7 @@ def _from_pretrained( model_save_dir=model_save_dir, preprocessors=preprocessors, use_cache=use_cache, + generation_config=generation_config, ) @classmethod @@ -600,7 +620,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -616,15 +635,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - file_name = ONNX_WEIGHTS_NAME if use_merged: @@ -655,8 +665,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -712,6 +720,10 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> for layer_past in past ) + def _save_pretrained(self, save_directory: Union[str, Path]): + super()._save_pretrained(save_directory) + self.generation_config.save_pretrained(save_directory) + class ORTGPTBigCodeForCausalLM(ORTModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM.prepare_inputs_for_generation diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 9166f7c2cbe..17bd3e2a4e7 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -663,8 +663,6 @@ def _export( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -1171,7 +1169,6 @@ def _export( library_name="transformers", ) - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 3cecadafe3e..fda3ca82bbe 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -18,7 +18,6 @@ import logging import shutil -import warnings from abc import ABC, abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -706,6 +705,18 @@ def show_deprecated_argument(arg_name): generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + @abstractmethod def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: pass @@ -780,7 +791,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -799,15 +809,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -996,19 +998,21 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - generation_config = None - try: - generation_config = GenerationConfig.from_pretrained( - model_id, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder=subfolder, - ) - except OSError: - logger.info("Generation config file not found, using a generation config created from the model config.") + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) onnx_paths = [encoder_path] if use_merged is False: @@ -1035,7 +1039,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -1051,15 +1054,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForConditionalGeneration": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - if use_cache is False and use_merged is True: raise ValueError( "The incompatible arguments use_cache=False, use_merged=True were passed to" @@ -1091,8 +1085,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/optimization.py b/optimum/onnxruntime/optimization.py index 9e62a3f324c..fd6958bba7d 100644 --- a/optimum/onnxruntime/optimization.py +++ b/optimum/onnxruntime/optimization.py @@ -20,6 +20,7 @@ import onnx from onnx import load_model +from transformers import GenerationConfig from transformers.models.auto.configuration_auto import AutoConfig from onnxruntime.transformers.onnx_model_bert import BertOnnxModel @@ -152,10 +153,6 @@ def optimize( save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) ORTConfigManager.check_optimization_supported_model(self.model_type, optimization_config) - - self.config.save_pretrained(save_dir) - maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) - model_type = ORTConfigManager.get_model_ort_type(self.config.model_type) optimization_options = optimization_config.create_fusion_options(model_type) @@ -236,6 +233,13 @@ def optimize( # Save the model configuration self.config.save_pretrained(save_dir) ort_config.save_pretrained(save_dir) + maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) + + try: + generation_config = GenerationConfig.from_pretrained(self.onnx_model_path[0].parent) + generation_config.save_pretrained(save_dir) + except Exception: + pass logger.info( f"Optimized model saved at: {save_dir} (external data format: " diff --git a/setup.py b/setup.py index ac5db71a74b..24c1ae1cd4d 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.29,<4.45.0", + "transformers[sentencepiece]>=4.29,<4.46.0", "torch>=1.11", "packaging", "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 @@ -54,6 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", + "transformers<4.45.0", ], "onnxruntime-gpu": [ "onnx", @@ -62,9 +63,10 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. 
+ "transformers<4.45.0", ], - "exporters": ["onnx", "onnxruntime", "timm"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], + "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.45.0"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.45.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", @@ -75,6 +77,7 @@ "numpy<1.24.0", "datasets<=2.16", "transformers[sentencepiece]>=4.26,<4.38", + "transformers<4.45.0", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index e9e2edd9790..098882180aa 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -59,12 +59,12 @@ # "llama": "fxmarty/tiny-llama-fast-tokenizer", # "llama-gqa": "noamwies/llama-test-gqa-with-better-transformer", "m2m_100": "hf-internal-testing/tiny-random-nllb", - "marian": "fxmarty/tiny-marian", # the other tiny ones have a too small max_position_embeddings + "marian": "optimum-internal-testing/tiny-random-marian", # the other tiny ones have a too small max_position_embeddings "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "hf-internal-testing/tiny-random-mbart", "opt": "hf-internal-testing/tiny-random-OPTModel", "pegasus": "hf-internal-testing/tiny-random-PegasusModel", - "prophetnet": "hirotasoshu/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings + "prophetnet": "optimum-internal-testing/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings "rembert": "hf-internal-testing/tiny-random-RemBertModel", "roberta": "hf-internal-testing/tiny-random-RobertaModel", "rocbert": "hf-internal-testing/tiny-random-RoCBertModel", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 0790f6329dc..17f3b391b04 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -112,9 +112,9 @@ "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", "longt5": "hf-internal-testing/tiny-random-LongT5Model", - "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama": "optimum-internal-testing/tiny-random-llama", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", - "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken + "marian": "echarlaix/tiny-random-marian", "mbart": "hf-internal-testing/tiny-random-mbart", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", @@ -152,7 +152,7 @@ "unispeech_sat": "hf-internal-testing/tiny-random-UnispeechSatModel", "vision-encoder-decoder": "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2", "vit": "hf-internal-testing/tiny-random-vit", - "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken + "whisper": "optimum-internal-testing/tiny-random-whisper", "wav2vec2": "hf-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer", "wavlm": "hf-internal-testing/tiny-random-WavlmModel", From d9754abdd973a69829dda191c495c4e70359d8dc Mon Sep 17 00:00:00 2001 From: Vijay Date: Tue, 8 Oct 2024 15:58:24 +0530 Subject: [PATCH 10/50] Remove numpy version constraint in setup.py (#2039) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py 
index 24c1ae1cd4d..0e2f0fd1bb6 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ "transformers[sentencepiece]>=4.29,<4.46.0", "torch>=1.11", "packaging", - "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 + "numpy", "huggingface_hub>=0.8.0", "datasets", ] From d3c56cd55444de15499c8d72a501d07631eff5ae Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:17:43 +0200 Subject: [PATCH 11/50] Update/Fix Pipeline Mixins and ORT Pipelines (#2021) * created auto task mappings * added correct auto classes * created auto task mappings * added correct auto classes * added ort/auto diffusion classes * fix ORTPipeline detection * start test refactoring * dynamic dtype * support torch random numbers generator * compact diffusion testing suite * fix * test * test * test * use latent-consistency architecture name instead of lcm * fix * add ort diffusion pipeline tests * added dummy objects * remove duplicate code * update stable diffusion mixin * update latent consistency * update sd for img2img * update latent consistency * update model parts to use frozen dict * update tests and utils * updated all mixins, enabled all tests ; all are passing except some reproducibility and comparaison tests (7 failed, 35 passed) * fix sd xl hidden states * style * support testing without diffusers * remove unnecessary * revert * export vae encoder by returning its latent distribution parameters * fix the modeling to handle distributions * create vae class to minimize changes in pipeline mixins * remove unnecessary tests * style * style * update diffusion models export test * style * fall back for when block_out_channels is not in vae config * remove model parts from optimum.onnxruntime * added .to to model parts * remove custom mixins * style * Update optimum/exporters/onnx/model_configs.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/exporters/onnx/model_configs.py * conversion to numpy always work * test adding two new pipelines * remove duplicated tests * match diffusers numpy input * simplify model saving * extend tests and only translate generators * cleanup * reduce parent model usage in model parts * fix * new tiny onnx diffusion model with configs * model_save_path * Update optimum/onnxruntime/modeling_diffusion.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * migrate tiny-stable-diffusion-onnx * resolve breaking change and mandatory arguments * overwrite _get_add_time_ids * fix * remove inference calls from loading tests * misc * better compatibility between model parts and parent pipeline * remove subfolder * misc * update * support passing safety checker * dummies * remove the need for ORTPipeline --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/onnx/model_configs.py | 6 +- optimum/exporters/utils.py | 24 +- optimum/onnx/utils.py | 16 + optimum/onnxruntime/__init__.py | 8 + optimum/onnxruntime/base.py | 19 + optimum/onnxruntime/modeling_diffusion.py | 967 ++++++++++-------- optimum/onnxruntime/modeling_ort.py | 2 +- optimum/onnxruntime/modeling_seq2seq.py | 2 +- optimum/onnxruntime/utils.py | 15 + .../diffusers/pipeline_latent_consistency.py | 230 ----- .../diffusers/pipeline_stable_diffusion.py | 427 -------- .../pipeline_stable_diffusion_img2img.py | 309 ------ .../pipeline_stable_diffusion_inpaint.py | 353 ------- 
.../diffusers/pipeline_stable_diffusion_xl.py | 506 --------- .../pipeline_stable_diffusion_xl_img2img.py | 515 ---------- optimum/pipelines/diffusers/pipeline_utils.py | 282 ----- optimum/pipelines/diffusers/watermark.py | 31 - tests/exporters/onnx/test_onnx_export.py | 15 +- tests/onnxruntime/test_diffusion.py | 578 +++++------ tests/onnxruntime/test_modeling.py | 14 +- tests/onnxruntime/utils_onnxruntime_tests.py | 5 + 21 files changed, 914 insertions(+), 3410 deletions(-) delete mode 100644 optimum/pipelines/diffusers/pipeline_latent_consistency.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py delete mode 100644 optimum/pipelines/diffusers/pipeline_utils.py delete mode 100644 optimum/pipelines/diffusers/watermark.py diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 36963a986d0..e77f649f69b 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1112,7 +1112,7 @@ def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]: class VaeEncoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-2 + ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1132,12 +1132,12 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, + "latent_parameters": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } class VaeDecoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-3 + ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index e2125736c4d..949b54f4685 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -46,11 +46,6 @@ from diffusers import ( DiffusionPipeline, - LatentConsistencyModelImg2ImgPipeline, - LatentConsistencyModelPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, @@ -92,27 +87,13 @@ def _get_submodels_for_export_diffusion( Returns the components of a Stable Diffusion model. 
""" - is_stable_diffusion = isinstance( - pipeline, (StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline) - ) is_stable_diffusion_xl = isinstance( pipeline, (StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline) ) - is_latent_consistency_model = isinstance( - pipeline, (LatentConsistencyModelPipeline, LatentConsistencyModelImg2ImgPipeline) - ) - if is_stable_diffusion_xl: projection_dim = pipeline.text_encoder_2.config.projection_dim - elif is_stable_diffusion: - projection_dim = pipeline.text_encoder.config.projection_dim - elif is_latent_consistency_model: - projection_dim = pipeline.text_encoder.config.projection_dim else: - raise ValueError( - f"The export of a DiffusionPipeline model with the class name {pipeline.__class__.__name__} is currently not supported in Optimum. " - "Please open an issue or submit a PR to add the support." - ) + projection_dim = pipeline.text_encoder.config.projection_dim models_for_export = {} @@ -139,7 +120,8 @@ def _get_submodels_for_export_diffusion( vae_encoder = copy.deepcopy(pipeline.vae) if not is_torch_greater_or_equal_than_2_1: vae_encoder = override_diffusers_2_0_attn_processors(vae_encoder) - vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample()} + # we return the distribution parameters to be able to recreate it in the decoder + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} models_for_export["vae_encoder"] = vae_encoder # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 diff --git a/optimum/onnx/utils.py b/optimum/onnx/utils.py index b52c4f4cdac..c014c1b3429 100644 --- a/optimum/onnx/utils.py +++ b/optimum/onnx/utils.py @@ -71,6 +71,22 @@ def _get_external_data_paths(src_paths: List[Path], dst_paths: List[Path]) -> Tu return src_paths, dst_paths +def _get_model_external_data_paths(model_path: Path) -> List[Path]: + """ + Gets external data paths from the model. + """ + + onnx_model = onnx.load(str(model_path), load_external_data=False) + model_tensors = _get_initializer_tensors(onnx_model) + # filter out tensors that are not external data + model_tensors_ext = [ + ExternalDataInfo(tensor).location + for tensor in model_tensors + if tensor.HasField("data_location") and tensor.data_location == onnx.TensorProto.EXTERNAL + ] + return [model_path.parent / tensor_name for tensor_name in model_tensors_ext] + + def check_model_uses_external_data(model: onnx.ModelProto) -> bool: """ Checks if the model uses external data. 
diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 1cb5b7c47b9..4e25a436909 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -79,7 +79,9 @@ "ORTStableDiffusionInpaintPipeline", "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", + "ORTStableDiffusionXLInpaintPipeline", "ORTLatentConsistencyModelPipeline", + "ORTLatentConsistencyModelImg2ImgPipeline", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", "ORTPipelineForText2Image", @@ -92,6 +94,8 @@ "ORTStableDiffusionInpaintPipeline", "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", + "ORTStableDiffusionXLInpaintPipeline", + "ORTLatentConsistencyModelImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", @@ -148,6 +152,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( ORTDiffusionPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, @@ -156,11 +161,13 @@ ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: from .modeling_diffusion import ( ORTDiffusionPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, @@ -169,6 +176,7 @@ ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index 0e54bafed78..845780cafad 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -71,6 +71,25 @@ def dtype(self): return None + def to(self, *args, device: Optional[Union[torch.device, str, int]] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, torch.device): + device = arg + elif isinstance(arg, torch.dtype): + dtype = arg + + if device is not None and device != self.device: + raise ValueError( + "Cannot change the device of a model part without changing the device of the parent model. " + "Please use the `to` method of the parent model to change the device." + ) + + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " + f"Please export the model with the desired dtype." + ) + @abstractmethod def forward(self, *args, **kwargs): pass diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 18cd38c5f29..87fcb68c7e9 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -13,10 +13,11 @@ # limitations under the License. 
 import importlib
+import inspect
 import logging
 import os
 import shutil
-import warnings
+from abc import abstractmethod
 from collections import OrderedDict
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -24,23 +25,25 @@
 import numpy as np
 import torch
-from diffusers import (
+from diffusers.configuration_utils import ConfigMixin
+from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
+from diffusers.pipelines import (
     AutoPipelineForImage2Image,
     AutoPipelineForInpainting,
     AutoPipelineForText2Image,
-    ConfigMixin,
-    DDIMScheduler,
+    LatentConsistencyModelImg2ImgPipeline,
     LatentConsistencyModelPipeline,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
     StableDiffusionImg2ImgPipeline,
     StableDiffusionInpaintPipeline,
     StableDiffusionPipeline,
     StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLInpaintPipeline,
     StableDiffusionXLPipeline,
 )
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.schedulers import SchedulerMixin
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
-from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
+from diffusers.utils.constants import CONFIG_NAME
 from huggingface_hub import snapshot_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from huggingface_hub.utils import validate_hf_hub_args
@@ -51,14 +54,7 @@
 import onnxruntime as ort

 from ..exporters.onnx import main_export
-from ..onnx.utils import _get_external_data_paths
-from ..pipelines.diffusers.pipeline_latent_consistency import LatentConsistencyPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin
-from ..pipelines.diffusers.pipeline_utils import VaeImageProcessor
+from ..onnx.utils import _get_model_external_data_paths
 from ..utils import (
     DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER,
     DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
@@ -66,12 +62,12 @@
     DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
 )
-from .base import ORTModelPart
 from .io_binding import TypeHelper
 from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel
 from .utils import (
     ONNX_WEIGHTS_NAME,
     get_provider_for_device,
+    np_to_pt_generators,
     parse_device,
     validate_provider_availability,
 )
@@ -80,380 +76,287 @@
 logger = logging.getLogger(__name__)


-class ORTPipeline(ORTModel):
-    auto_model_class = None
-    model_type = "onnx_pipeline"
-
+# TODO: support from_pipe()
+# TODO: Instead of ORTModel, it makes sense to have a compositional ORTMixin
+# TODO: instead of one bloated __init__, we should consider an __init__ per pipeline
+class ORTDiffusionPipeline(ORTModel, DiffusionPipeline):
     config_name = "model_index.json"
-    sub_component_config_name = "config.json"
+    auto_model_class = DiffusionPipeline

     def __init__(
         self,
-        vae_decoder_session: ort.InferenceSession,
+        scheduler: "SchedulerMixin",
         unet_session: ort.InferenceSession,
-        tokenizer: CLIPTokenizer,
-        config: Dict[str, Any],
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-        feature_extractor: Optional[CLIPFeatureExtractor] = None,
+        vae_decoder_session: ort.InferenceSession,
+        # optional pipeline models
         vae_encoder_session: Optional[ort.InferenceSession] = None,
         text_encoder_session: Optional[ort.InferenceSession] = None,
         text_encoder_2_session: Optional[ort.InferenceSession] = None,
-        tokenizer_2: Optional[CLIPTokenizer] = None,
+        # optional pipeline submodels
+        tokenizer: Optional["CLIPTokenizer"] = None,
+        tokenizer_2: Optional["CLIPTokenizer"] = None,
+        feature_extractor: Optional["CLIPFeatureExtractor"] = None,
+        # stable diffusion xl specific arguments
+        force_zeros_for_empty_prompt: bool = True,
+        requires_aesthetics_score: bool = False,
+        add_watermarker: Optional[bool] = None,
+        # onnxruntime specific arguments
        use_io_binding: Optional[bool] = None,
        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        **kwargs,
     ):
-        """
-        Args:
-            vae_decoder_session (`ort.InferenceSession`):
-                The ONNX Runtime inference session associated to the VAE decoder
-            unet_session (`ort.InferenceSession`):
-                The ONNX Runtime inference session associated to the U-NET.
-            tokenizer (`CLIPTokenizer`):
-                Tokenizer of class
-                [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
-                for the text encoder.
-            config (`Dict[str, Any]`):
-                A config dictionary from which the model components will be instantiated. Make sure to only load
-                configuration files of compatible classes.
-            scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`):
-                A scheduler to be used in combination with the U-NET component to denoise the encoded image latents.
-            feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`):
-                A model extracting features from generated images to be used as inputs for the `safety_checker`
-            vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`):
-                The ONNX Runtime inference session associated to the VAE encoder.
-            text_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`):
-                The ONNX Runtime inference session associated to the text encoder.
-            tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`):
-                Tokenizer of class
-                [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
-                for the second text encoder.
-            use_io_binding (`Optional[bool]`, defaults to `None`):
-                Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to
-                `True` if the device is CUDA, otherwise defaults to `False`.
-            model_save_dir (`Optional[str]`, defaults to `None`):
-                The directory under which the model exported to ONNX was saved.
- """ - self.shared_attributes_init( - model=vae_decoder_session, - use_io_binding=use_io_binding, - model_save_dir=model_save_dir, - ) - self._internal_dict = config - self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) - self.vae_decoder_model_path = Path(vae_decoder_session._model_path) self.unet = ORTModelUnet(unet_session, self) - self.unet_model_path = Path(unet_session._model_path) - - if text_encoder_session is not None: - self.text_encoder_model_path = Path(text_encoder_session._model_path) - self.text_encoder = ORTModelTextEncoder(text_encoder_session, self) - else: - self.text_encoder_model_path = None - self.text_encoder = None - - if vae_encoder_session is not None: - self.vae_encoder_model_path = Path(vae_encoder_session._model_path) - self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) - else: - self.vae_encoder_model_path = None - self.vae_encoder = None + self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) + self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None + self.text_encoder = ( + ORTModelTextEncoder(text_encoder_session, self) if text_encoder_session is not None else None + ) + self.text_encoder_2 = ( + ORTModelTextEncoder(text_encoder_2_session, self) if text_encoder_2_session is not None else None + ) + # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API + self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) - if text_encoder_2_session is not None: - self.text_encoder_2_model_path = Path(text_encoder_2_session._model_path) - self.text_encoder_2 = ORTModelTextEncoder(text_encoder_2_session, self) - else: - self.text_encoder_2_model_path = None - self.text_encoder_2 = None + # we allow passing these as torch models for now + self.image_encoder = kwargs.pop("image_encoder", None) # TODO: maybe implement ORTModelImageEncoder + self.safety_checker = kwargs.pop("safety_checker", None) # TODO: maybe implement ORTModelSafetyChecker + self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 - self.scheduler = scheduler self.feature_extractor = feature_extractor - self.safety_checker = None - - sub_models = { - DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder, - DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet, - DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder, - DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder, - DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2, - } - - # Modify config to keep the resulting model compatible with diffusers pipelines - for name in sub_models.keys(): - self._internal_dict[name] = ( - ("diffusers", "OnnxRuntimeModel") if sub_models[name] is not None else (None, None) - ) - self._internal_dict.pop("vae", None) - - if "block_out_channels" in self.vae_decoder.config: - self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) - else: - self.vae_scale_factor = 8 - - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - - @staticmethod - def load_model( - vae_decoder_path: Union[str, Path], - text_encoder_path: Union[str, Path], - unet_path: Union[str, Path], - vae_encoder_path: Optional[Union[str, Path]] = None, - text_encoder_2_path: Optional[Union[str, Path]] = None, - provider: str = "CPUExecutionProvider", - session_options: Optional[ort.SessionOptions] = None, - provider_options: Optional[Dict] = None, - ): - """ - Creates three inference sessions for respectively the VAE decoder, the text encoder 
and the U-NET models. - The default provider is `CPUExecutionProvider` to match the default behaviour in PyTorch/TensorFlow/JAX. - Args: - vae_decoder_path (`Union[str, Path]`): - The path to the VAE decoder ONNX model. - text_encoder_path (`Union[str, Path]`): - The path to the text encoder ONNX model. - unet_path (`Union[str, Path]`): - The path to the U-NET ONNX model. - vae_encoder_path (`Union[str, Path]`, defaults to `None`): - The path to the VAE encoder ONNX model. - text_encoder_2_path (`Union[str, Path]`, defaults to `None`): - The path to the second text decoder ONNX model. - provider (`str`, defaults to `"CPUExecutionProvider"`): - ONNX Runtime provider to use for loading the model. See https://onnxruntime.ai/docs/execution-providers/ - for possible providers. - session_options (`Optional[ort.SessionOptions]`, defaults to `None`): - ONNX Runtime session options to use for loading the model. Defaults to `None`. - provider_options (`Optional[Dict]`, defaults to `None`): - Provider option dictionary corresponding to the provider used. See available options - for each provider: https://onnxruntime.ai/docs/api/c/group___global.html . Defaults to `None`. - """ - vae_decoder = ORTModel.load_model(vae_decoder_path, provider, session_options, provider_options) - unet = ORTModel.load_model(unet_path, provider, session_options, provider_options) - - sessions = { - "vae_encoder": vae_encoder_path, - "text_encoder": text_encoder_path, - "text_encoder_2": text_encoder_2_path, + all_pipeline_init_args = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, + "scheduler": self.scheduler, + "tokenizer": self.tokenizer, + "tokenizer_2": self.tokenizer_2, + "feature_extractor": self.feature_extractor, + "requires_aesthetics_score": requires_aesthetics_score, + "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, + "add_watermarker": add_watermarker, } - for key, value in sessions.items(): - if value is not None and value.is_file(): - sessions[key] = ORTModel.load_model(value, provider, session_options, provider_options) - else: - sessions[key] = None + diffusers_pipeline_args = {} + for key in inspect.signature(self.auto_model_class).parameters.keys(): + if key in all_pipeline_init_args: + diffusers_pipeline_args[key] = all_pipeline_init_args[key] + # inits diffusers pipeline specific attributes (registers modules and config) + self.auto_model_class.__init__(self, **diffusers_pipeline_args) - return vae_decoder, sessions["text_encoder"], unet, sessions["vae_encoder"], sessions["text_encoder_2"] + # inits ort specific attributes + self.shared_attributes_init( + model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs + ) def _save_pretrained(self, save_directory: Union[str, Path]): save_directory = Path(save_directory) - src_to_dst_path = { - self.vae_decoder_model_path: save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.text_encoder_model_path: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.unet_model_path: save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME, - } - sub_models_to_save = { - self.vae_encoder_model_path: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, - self.text_encoder_2_model_path: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + models_to_save_paths = { + (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER), + 
(self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER), + (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), + (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), + (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), } - for path, subfolder in sub_models_to_save.items(): - if path is not None: - src_to_dst_path[path] = save_directory / subfolder / ONNX_WEIGHTS_NAME - - # TODO: Modify _get_external_data_paths to give dictionnary - src_paths = list(src_to_dst_path.keys()) - dst_paths = list(src_to_dst_path.values()) - # Add external data paths in case of large models - src_paths, dst_paths = _get_external_data_paths(src_paths, dst_paths) - - for src_path, dst_path in zip(src_paths, dst_paths): - dst_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copyfile(src_path, dst_path) - config_path = src_path.parent / self.sub_component_config_name - if config_path.is_file(): - shutil.copyfile(config_path, dst_path.parent / self.sub_component_config_name) + for model, save_path in models_to_save_paths: + if model is not None: + model_path = Path(model.session._model_path) + save_path.mkdir(parents=True, exist_ok=True) + # copy onnx model + shutil.copyfile(model_path, save_path / ONNX_WEIGHTS_NAME) + # copy external onnx data + external_data_paths = _get_model_external_data_paths(model_path) + for external_data_path in external_data_paths: + shutil.copyfile(external_data_path, save_path / external_data_path.name) + # copy model config + config_path = model_path.parent / CONFIG_NAME + if config_path.is_file(): + config_save_path = save_path / CONFIG_NAME + shutil.copyfile(config_path, config_save_path) self.scheduler.save_pretrained(save_directory / "scheduler") - if self.feature_extractor is not None: - self.feature_extractor.save_pretrained(save_directory / "feature_extractor") if self.tokenizer is not None: self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.feature_extractor is not None: + self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @classmethod def _from_pretrained( cls, model_id: Union[str, Path], config: Dict[str, Any], - use_auth_token: Optional[Union[bool, str]] = None, - token: Optional[Union[bool, str]] = None, + subfolder: str = "", + force_download: bool = False, + local_files_only: bool = False, revision: Optional[str] = None, + trust_remote_code: bool = False, cache_dir: str = HUGGINGFACE_HUB_CACHE, - vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, - text_encoder_file_name: str = ONNX_WEIGHTS_NAME, + token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, + vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, + text_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, - local_files_only: bool = False, + use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", - session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, - use_io_binding: Optional[bool] = None, + session_options: Optional[ort.SessionOptions] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, + if use_io_binding: + raise ValueError( + "IOBinding is not yet available for diffusion pipelines, please set `use_io_binding` to False." ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - - if provider == "TensorrtExecutionProvider": - raise ValueError("The provider `'TensorrtExecutionProvider'` is not supported") - model_id = str(model_id) - patterns = set(config.keys()) - sub_models_to_load = patterns.intersection({"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}) - - if not os.path.isdir(model_id): - patterns.update({"vae_encoder", "vae_decoder"}) - allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} + if not os.path.isdir(str(model_id)): + all_components = {key for key in config.keys() if not key.startswith("_")} | {"vae_encoder", "vae_decoder"} + allow_patterns = {os.path.join(component, "*") for component in all_components} allow_patterns.update( { - vae_decoder_file_name, - text_encoder_file_name, unet_file_name, + vae_decoder_file_name, vae_encoder_file_name, + text_encoder_file_name, text_encoder_2_file_name, SCHEDULER_CONFIG_NAME, - CONFIG_NAME, cls.config_name, + CONFIG_NAME, } ) - # Downloads all repo's files matching the allowed patterns - model_id = snapshot_download( + model_save_folder = snapshot_download( model_id, cache_dir=cache_dir, + force_download=force_download, local_files_only=local_files_only, - token=token, revision=revision, + token=token, allow_patterns=allow_patterns, ignore_patterns=["*.msgpack", "*.safetensors", "*.bin", "*.xml"], ) - new_model_save_dir = Path(model_id) + else: + model_save_folder = str(model_id) + + model_save_path = Path(model_save_folder) + + if model_save_dir is None: + model_save_dir = model_save_path + + model_paths = { + "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, + "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, + "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, + "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + } + + sessions = {} + for model, path in model_paths.items(): + if kwargs.get(model, None) is not None: + # this allows passing a model directly to from_pretrained + sessions[f"{model}_session"] = kwargs.pop(model) + else: + sessions[f"{model}_session"] = ( + ORTModel.load_model(path, provider, session_options, provider_options) if path.is_file() else None + ) - sub_models = {} - for name in sub_models_to_load: - library_name, library_classes = config[name] - if library_classes is not None: + submodels = {} + for submodel in {"scheduler", "tokenizer", "tokenizer_2", "feature_extractor"}: + if kwargs.get(submodel, None) is not None: + submodels[submodel] = kwargs.pop(submodel) + elif config.get(submodel, (None, None))[0] is not None: + library_name, library_classes = config.get(submodel) library = importlib.import_module(library_name) class_obj = getattr(library, library_classes) load_method = getattr(class_obj, "from_pretrained") # Check if the module is in a subdirectory - if (new_model_save_dir / name).is_dir(): - sub_models[name] = load_method(new_model_save_dir / name) + if (model_save_path / submodel).is_dir(): + submodels[submodel] = 
load_method(model_save_path / submodel) else: - sub_models[name] = load_method(new_model_save_dir) - - vae_decoder, text_encoder, unet, vae_encoder, text_encoder_2 = cls.load_model( - vae_decoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, - text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, - unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_2_path=( - new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name - ), - provider=provider, - session_options=session_options, - provider_options=provider_options, - ) - - if model_save_dir is None: - model_save_dir = new_model_save_dir + submodels[submodel] = load_method(model_save_path) - if use_io_binding: - raise ValueError( - "IOBinding is not yet available for stable diffusion model, please set `use_io_binding` to False." - ) + # same as DiffusionPipeline.from_pretraoned, if called directly, it loads the class in the config + if cls.__name__ == "ORTDiffusionPipeline": + class_name = config["_class_name"] + ort_pipeline_class = _get_ort_class(class_name) + else: + ort_pipeline_class = cls - return cls( - vae_decoder_session=vae_decoder, - text_encoder_session=text_encoder, - unet_session=unet, - config=config, - tokenizer=sub_models.get("tokenizer", None), - scheduler=sub_models.get("scheduler"), - feature_extractor=sub_models.get("feature_extractor", None), - tokenizer_2=sub_models.get("tokenizer_2", None), - vae_encoder_session=vae_encoder, - text_encoder_2_session=text_encoder_2, + ort_pipeline = ort_pipeline_class( + **sessions, + **submodels, use_io_binding=use_io_binding, model_save_dir=model_save_dir, + **kwargs, ) + # same as in DiffusionPipeline.from_pretrained, we save where the model was instantiated from + ort_pipeline.register_to_config(_name_or_path=config.get("_name_or_path", str(model_id))) + + return ort_pipeline + @classmethod - def _from_transformers( + def _export( cls, model_id: str, - config: Optional[str] = None, - use_auth_token: Optional[Union[bool, str]] = None, - token: Optional[Union[bool, str]] = None, - revision: str = "main", - force_download: bool = True, - cache_dir: str = HUGGINGFACE_HUB_CACHE, + config: Dict[str, Any], subfolder: str = "", + force_download: bool = False, local_files_only: bool = False, + revision: Optional[str] = None, trust_remote_code: bool = False, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, + use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, - use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTPipeline": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + **kwargs, + ) -> "ORTDiffusionPipeline": if task is None: task = cls._auto_model_to_task(cls.auto_model_class) - save_dir = TemporaryDirectory() - save_dir_path = Path(save_dir.name) + # we continue passing the model_save_dir from here on to avoid it being cleaned up + # might be better to use a persistent temporary directory such as the one implemented in + # https://gist.github.com/twolfson/2929dc1163b0a76d2c2b66d51f9bc808 + model_save_dir = TemporaryDirectory() + model_save_path = Path(model_save_dir.name) main_export( - model_name_or_path=model_id, - output=save_dir_path, - task=task, + model_id, + output=model_save_path, do_validation=False, no_post_process=True, - subfolder=subfolder, + token=token, revision=revision, cache_dir=cache_dir, - token=token, - local_files_only=local_files_only, + subfolder=subfolder, force_download=force_download, + local_files_only=local_files_only, trust_remote_code=trust_remote_code, + library_name="diffusers", + task=task, ) return cls._from_pretrained( - save_dir_path, + model_save_path, config=config, provider=provider, - session_options=session_options, provider_options=provider_options, + session_options=session_options, use_io_binding=use_io_binding, - model_save_dir=save_dir, + model_save_dir=model_save_dir, + **kwargs, ) def to(self, device: Union[torch.device, str, int]): @@ -471,19 +374,22 @@ def to(self, device: Union[torch.device, str, int]): device, provider_options = parse_device(device) provider = get_provider_for_device(device) - validate_provider_availability(provider) # raise error if the provider is not available + validate_provider_availability(provider) if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": return self - self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) - self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) self.unet.session.set_providers([provider], provider_options=[provider_options]) + self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) if self.vae_encoder is not None: self.vae_encoder.session.set_providers([provider], provider_options=[provider_options]) + if self.text_encoder is not None: + self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) + if self.text_encoder_2 is not None: + self.text_encoder_2.session.set_providers([provider], provider_options=[provider_options]) - self.providers = self.vae_decoder.session.get_providers() + self.providers = self.unet.session.get_providers() self._device = device return self @@ -495,41 +401,142 @@ def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): def _save_config(self, save_directory): self.save_config(save_directory) + @property + def components(self) -> Dict[str, Any]: + components = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, + } + components = {k: v for k, v in components.items() if v is not None} + return components -class ORTPipelinePart(ORTModelPart): - CONFIG_NAME = "config.json" + def __call__(self, *args, **kwargs): + # we do this to keep numpy random states support for now + # TODO: deprecate and 
add warnings when a random state is passed - def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): - config_path = Path(session._model_path).parent / self.CONFIG_NAME + args = list(args) + for i in range(len(args)): + args[i] = np_to_pt_generators(args[i], self.device) - if config_path.is_file(): - # TODO: use FrozenDict - self.config = parent_model._dict_from_json_file(config_path) - else: - self.config = {} + for k, v in kwargs.items(): + kwargs[k] = np_to_pt_generators(v, self.device) + + return self.auto_model_class.__call__(self, *args, **kwargs) - super().__init__(session, parent_model) + +class ORTPipelinePart(ConfigMixin): + config_name: str = CONFIG_NAME + + def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTDiffusionPipeline): + self.session = session + self.parent_pipeline = parent_pipeline + + self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} + self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in self.session.get_inputs()} + self.output_dtypes = {output_key.name: output_key.type for output_key in self.session.get_outputs()} + + config_file_path = Path(session._model_path).parent / self.config_name + if not config_file_path.is_file(): + # config is mandatory for the model part to be used for inference + raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_file_path}") + config_dict = self._dict_from_json_file(config_file_path) + self.register_to_config(**config_dict) @property - def input_dtype(self): - # for backward compatibility and diffusion mixins (will be standardized in the future) - return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} + def device(self): + return self.parent_pipeline.device + @property + def dtype(self): + for dtype in self.input_dtypes.values(): + torch_dtype = TypeHelper.ort_type_to_torch_type(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = TypeHelper.ort_type_to_torch_type(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + + def to(self, *args, device: Optional[Union[torch.device, str, int]] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, torch.device): + device = arg + elif isinstance(arg, (int, str)): + device = torch.device(arg) + elif isinstance(arg, torch.dtype): + dtype = arg + + if device is not None and device != self.device: + raise ValueError( + "Cannot change the device of a pipeline part without changing the device of the parent pipeline. " + "Please use the `to` method of the parent pipeline to change the device." + ) -class ORTModelTextEncoder(ORTPipelinePart): - def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): - use_torch = isinstance(input_ids, torch.Tensor) + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the pipeline from {self.dtype} to {dtype}. " + f"Please export the pipeline with the desired dtype." 
+ ) - model_inputs = {"input_ids": input_ids} + def prepare_onnx_inputs(self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray]) -> Dict[str, np.ndarray]: + onnx_inputs = {} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) - onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + # converts pytorch inputs into numpy inputs for onnx + for input_name in self.input_names.keys(): + onnx_inputs[input_name] = inputs.pop(input_name) - return ModelOutput(**model_outputs) + if use_torch: + onnx_inputs[input_name] = onnx_inputs[input_name].numpy(force=True) + + if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]: + onnx_inputs[input_name] = onnx_inputs[input_name].astype( + TypeHelper.ort_type_to_numpy_type(self.input_dtypes[input_name]) + ) + + return onnx_inputs + + def prepare_onnx_outputs( + self, use_torch: bool, *onnx_outputs: np.ndarray + ) -> Dict[str, Union[torch.Tensor, np.ndarray]]: + model_outputs = {} + + # converts onnxruntime outputs into tensor for standard outputs + for output_name, idx in self.output_names.items(): + model_outputs[output_name] = onnx_outputs[idx] + + if use_torch: + model_outputs[output_name] = torch.from_numpy(model_outputs[output_name]).to(self.device) + + return model_outputs + + @abstractmethod + def forward(self, *args, **kwargs): + raise NotImplementedError + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) class ORTModelUnet(ORTPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "time_cond_proj_dim"): + logger.warning( + "The `time_cond_proj_dim` attribute is missing from the UNet configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(time_cond_proj_dim=None) + def forward( self, sample: Union[np.ndarray, torch.Tensor], @@ -538,9 +545,15 @@ def forward( text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = False, ): use_torch = isinstance(sample, torch.Tensor) + if len(timestep.shape) == 0: + timestep = timestep.unsqueeze(0) + model_inputs = { "sample": sample, "timestep": timestep, @@ -548,171 +561,323 @@ def forward( "text_embeds": text_embeds, "time_ids": time_ids, "timestep_cond": timestep_cond, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), } - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) -class ORTModelVaeDecoder(ORTPipelinePart): - def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): - use_torch = isinstance(latent_sample, torch.Tensor) +class ORTModelTextEncoder(ORTPipelinePart): + def forward( + self, + input_ids: Union[np.ndarray, torch.Tensor], + attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, + output_hidden_states: Optional[bool] = None, + return_dict: bool = False, + ): + use_torch = isinstance(input_ids, torch.Tensor) - model_inputs = {"latent_sample": latent_sample} + model_inputs = {"input_ids": input_ids} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if output_hidden_states: + model_outputs["hidden_states"] = [] + for i in range(self.config.num_hidden_layers): + model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) + model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) + else: + for i in range(self.config.num_hidden_layers): + model_outputs.pop(f"hidden_states.{i}", None) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) class ORTModelVaeEncoder(ORTPipelinePart): - def forward(self, sample: Union[np.ndarray, torch.Tensor]): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE encoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) + + def forward( + self, + sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): use_torch = isinstance(sample, torch.Tensor) model_inputs = {"sample": sample} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if "latent_sample" in model_outputs: + model_outputs["latents"] = model_outputs.pop("latent_sample") + + if "latent_parameters" in model_outputs: + model_outputs["latent_dist"] = DiagonalGaussianDistribution( + parameters=model_outputs.pop("latent_parameters") + ) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) +class ORTModelVaeDecoder(ORTPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE decoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." + ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) + + def forward( + self, + latent_sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): + use_torch = isinstance(latent_sample, torch.Tensor) + + model_inputs = {"latent_sample": latent_sample} + + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if "latent_sample" in model_outputs: + model_outputs["latents"] = model_outputs.pop("latent_sample") + + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) + + +class ORTWrapperVae(ORTPipelinePart): + def __init__(self, encoder: ORTModelVaeEncoder, decoder: ORTModelVaeDecoder): + self.decoder = decoder + self.encoder = encoder + + @property + def config(self): + return self.decoder.config + + @property + def dtype(self): + return self.decoder.dtype + + @property + def device(self): + return self.decoder.device + + def decode(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + def encode(self, *args, **kwargs): + return self.encoder(*args, **kwargs) + + def to(self, *args, **kwargs): + self.decoder.to(*args, **kwargs) + if self.encoder is not None: + self.encoder.to(*args, **kwargs) + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). 
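    A minimal usage sketch (the checkpoint id is illustrative; `export=True` converts the PyTorch weights to ONNX on the fly):

        from optimum.onnxruntime import ORTStableDiffusionPipeline

        pipeline = ORTStableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", export=True
        )
        image = pipeline("sailing ship in storm by Leonardo da Vinci").images[0]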
""" main_input_name = "prompt" + export_feature = "text-to-image" auto_model_class = StableDiffusionPipeline - __call__ = StableDiffusionPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ - main_input_name = "prompt" + main_input_name = "image" + export_feature = "image-to-image" auto_model_class = StableDiffusionImg2ImgPipeline - __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ main_input_name = "prompt" + export_feature = "inpainting" auto_model_class = StableDiffusionInpaintPipeline - __call__ = StableDiffusionInpaintPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMixin): +class ORTStableDiffusionXLPipeline(ORTDiffusionPipeline, StableDiffusionXLPipeline): """ - ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). 
""" main_input_name = "prompt" - auto_model_class = LatentConsistencyModelPipeline - - __call__ = LatentConsistencyPipelineMixin.__call__ - + export_feature = "text-to-image" + auto_model_class = StableDiffusionXLPipeline -class ORTStableDiffusionXLPipelineBase(ORTPipeline): - def __init__( + def _get_add_time_ids( self, - vae_decoder_session: ort.InferenceSession, - text_encoder_session: ort.InferenceSession, - unet_session: ort.InferenceSession, - config: Dict[str, Any], - tokenizer: CLIPTokenizer, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - feature_extractor: Optional[CLIPFeatureExtractor] = None, - vae_encoder_session: Optional[ort.InferenceSession] = None, - text_encoder_2_session: Optional[ort.InferenceSession] = None, - tokenizer_2: Optional[CLIPTokenizer] = None, - use_io_binding: Optional[bool] = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - add_watermarker: Optional[bool] = None, + original_size, + crops_coords_top_left, + target_size, + dtype, + text_encoder_projection_dim=None, ): - super().__init__( - vae_decoder_session=vae_decoder_session, - text_encoder_session=text_encoder_session, - unet_session=unet_session, - config=config, - tokenizer=tokenizer, - scheduler=scheduler, - feature_extractor=feature_extractor, - vae_encoder_session=vae_encoder_session, - text_encoder_2_session=text_encoder_2_session, - tokenizer_2=tokenizer_2, - use_io_binding=use_io_binding, - model_save_dir=model_save_dir, - ) + add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids - if add_watermarker: - if not is_invisible_watermark_available(): - raise ImportError( - "`add_watermarker` requires invisible-watermark to be installed, which can be installed with `pip install invisible-watermark`." - ) - from ..pipelines.diffusers.watermark import StableDiffusionXLWatermarker +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTStableDiffusionXLImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionXLImg2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
+ """ - self.watermark = StableDiffusionXLWatermarker() + main_input_name = "prompt" + export_feature = "image-to-image" + auto_model_class = StableDiffusionXLImg2ImgPipeline + + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) else: - self.watermark = None + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLPipelineMixin): +class ORTStableDiffusionXLInpaintPipeline(ORTDiffusionPipeline, StableDiffusionXLInpaintPipeline): """ - ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline). """ - main_input_name = "prompt" - auto_model_class = StableDiffusionXLPipeline + main_input_name = "image" + export_feature = "inpainting" + auto_model_class = StableDiffusionXLInpaintPipeline + + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) - __call__ = StableDiffusionXLPipelineMixin.__call__ + return add_time_ids, add_neg_time_ids @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyModelPipeline): """ - ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
+ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ main_input_name = "prompt" - auto_model_class = StableDiffusionXLImg2ImgPipeline + export_feature = "text-to-image" + auto_model_class = LatentConsistencyModelPipeline + + +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsistencyModelImg2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline). + """ - __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = LatentConsistencyModelImg2ImgPipeline SUPPORTED_ORT_PIPELINES = [ ORTStableDiffusionPipeline, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, - ORTLatentConsistencyModelPipeline, ORTStableDiffusionXLPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, + ORTLatentConsistencyModelPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ] -def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): +def _get_ort_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): for ort_pipeline_class in SUPPORTED_ORT_PIPELINES: if ( ort_pipeline_class.__name__ == pipeline_class_name @@ -724,31 +889,6 @@ def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}") -class ORTDiffusionPipeline(ConfigMixin): - config_name = "model_index.json" - - @classmethod - @validate_hf_hub_args - def from_pretrained(cls, pretrained_model_or_path, **kwargs): - load_config_kwargs = { - "force_download": kwargs.get("force_download", False), - "resume_download": kwargs.get("resume_download", None), - "local_files_only": kwargs.get("local_files_only", False), - "cache_dir": kwargs.get("cache_dir", None), - "revision": kwargs.get("revision", None), - "proxies": kwargs.get("proxies", None), - "token": kwargs.get("token", None), - } - - config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) - config = config[0] if isinstance(config, tuple) else config - class_name = config["_class_name"] - - ort_pipeline_class = _get_pipeline_class(class_name) - - return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) - - ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionPipeline), @@ -761,12 +901,14 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): [ ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), + ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ] ) ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLInpaintPipeline), ] ) @@ -777,7 +919,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): ] -def _get_task_class(mapping, pipeline_class_name): +def _get_task_ort_class(mapping, pipeline_class_name): def _get_model_name(pipeline_class_name): for ort_pipelines_mapping in 
SUPPORTED_ORT_PIPELINES_MAPPINGS: for model_name, ort_pipeline_class in ort_pipelines_mapping.items(): @@ -801,7 +943,8 @@ class ORTPipelineForTask(ConfigMixin): config_name = "model_index.json" @classmethod - def from_pretrained(cls, pretrained_model_or_path, **kwargs): + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTDiffusionPipeline: load_config_kwargs = { "force_download": kwargs.get("force_download", False), "resume_download": kwargs.get("resume_download", None), @@ -815,7 +958,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): config = config[0] if isinstance(config, tuple) else config class_name = config["_class_name"] - ort_pipeline_class = _get_task_class(cls.ort_pipelines_mapping, class_name) + ort_pipeline_class = _get_task_ort_class(cls.ort_pipelines_mapping, class_name) return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 17bd3e2a4e7..9b29afa566b 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -938,7 +938,7 @@ def _prepare_onnx_inputs( onnx_inputs[input_name] = inputs.pop(input_name) if use_torch: - onnx_inputs[input_name] = onnx_inputs[input_name].cpu().detach().numpy() + onnx_inputs[input_name] = onnx_inputs[input_name].numpy(force=True) if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]: onnx_inputs[input_name] = onnx_inputs[input_name].astype( diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index fda3ca82bbe..27e0dc01b4c 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -67,7 +67,7 @@ if check_if_transformers_greater("4.25.0"): from transformers.generation import GenerationMixin else: - from transformers.generation_utils import GenerationMixin + from transformers.generation_utils import GenerationMixin # type: ignore if check_if_transformers_greater("4.43.0"): diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 985980e31b0..128e2406f11 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -403,3 +403,18 @@ def evaluation_loop( metrics = {} return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=len(dataset)) + + +def np_to_pt_generators(np_object, device): + if isinstance(np_object, np.random.RandomState): + return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0])) + elif isinstance(np_object, np.random.Generator): + return torch.Generator(device=device).manual_seed(int(np_object.bit_generator.state[1][0])) + elif isinstance(np_object, list) and isinstance(np_object[0], (np.random.RandomState, np.random.Generator)): + return [np_to_pt_generators(a, device) for a in np_object] + elif isinstance(np_object, dict) and isinstance( + next(iter(np_object.values())), (np.random.RandomState, np.random.Generator) + ): + return {k: np_to_pt_generators(v, device) for k, v in np_object.items()} + else: + return np_object diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py deleted file mode 100644 index 630d463de73..00000000000 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. 
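# A minimal usage sketch for the `np_to_pt_generators` helper added to optimum/onnxruntime/utils.py
# in the hunk above: it converts the NumPy-style generators that the previous ORT pipeline mixins
# accepted into torch.Generator objects seeded from the NumPy state, so the diffusers pipelines the
# ORT classes now inherit from receive the generator type they expect. Illustrative only; it assumes
# the helper is importable from optimum.onnxruntime.utils once this patch is applied.
import numpy as np
import torch

from optimum.onnxruntime.utils import np_to_pt_generators

# A single RandomState becomes a torch.Generator seeded from the first word of its MT19937 state,
# so call sites that passed np.random.RandomState keep producing deterministic latents.
pt_generator = np_to_pt_generators(np.random.RandomState(42), device="cpu")
assert isinstance(pt_generator, torch.Generator)

# Lists (and dicts) of RandomState objects are converted element-wise; any other object is
# returned unchanged, so non-generator kwargs can be passed through the same code path.
pt_generators = np_to_pt_generators([np.random.RandomState(0), np.random.RandomState(1)], device="cpu")
assert all(isinstance(g, torch.Generator) for g in pt_generators)
assert np_to_pt_generators("unchanged", device="cpu") == "unchanged"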
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from typing import Callable, List, Optional, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -logger = logging.getLogger(__name__) - - -class LatentConsistencyPipelineMixin(StableDiffusionPipelineMixin): - # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264 - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 4, - original_inference_steps: int = None, - guidance_scale: float = 8.5, - num_images_per_prompt: int = 1, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not - provided, text embeddings will be generated from `prompt` input argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor - - # Don't need to get negative prompts due to LCM guided distillation - negative_prompt = None - negative_prompt_embeds = None - - # check inputs. 
Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - False, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps) - timesteps = self.scheduler.timesteps - - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - self.unet.config["in_channels"], - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - bs = batch_size * num_images_per_prompt - # get Guidance Scale Embedding - w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype) - w_embedding = self.get_guidance_scale_embedding( - w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype - ) - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latents, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - timestep_cond=w_embedding, - )[0] - - # compute the previous noisy sample x_t -> x_t-1 - latents, denoised = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False - ) - latents, denoised = latents.numpy(), denoised.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = denoised - has_nsfw_concept = None - else: - denoised /= self.vae_decoder.config["scaling_factor"] - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264 - def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None): - """ - See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - - Args: - timesteps (`torch.Tensor`): - generate embedding vectors at these timesteps - embedding_dim (`int`, *optional*, defaults to 512): - dimension of the embeddings to generate - dtype: - data type of the generated embeddings - - Returns: - 
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` - """ - w = w * 1000 - half_dim = embedding_dim // 2 - emb = np.log(10000.0) / (half_dim - 1) - emb = np.exp(np.arange(half_dim, dtype=dtype) * -emb) - emb = w[:, None] * emb[None, :] - emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=1) - - if embedding_dim % 2 == 1: # zero pad - emb = np.pad(emb, [(0, 0), (0, 1)]) - - assert emb.shape == (w.shape[0], embedding_dim) - return emb diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py deleted file mode 100644 index 6cc47fab1b9..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ /dev/null @@ -1,427 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import logging -from typing import Callable, List, Optional, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg - - -logger = logging.getLogger(__name__) - - -class StableDiffusionPipelineMixin(DiffusionPipelineMixin): - # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L114 - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids - - if not np.array_equal(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] - - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] * batch_size - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] - - if do_classifier_free_guidance: - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217 - def check_inputs( - self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*shape).astype(dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." - ) - elif latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - return latents - - # Adapted from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L264 - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - guidance_rescale: float = 0.0, - ): - r""" - Function invoked when calling the pipeline for generation. 
- - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. 
- guidance_rescale (`float`, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - - # check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - self.unet.config.get("in_channels", 4), - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - def run_safety_checker(self, image: np.ndarray): - if self.safety_checker is None: - has_nsfw_concept = None - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="np" - ).pixel_values.astype(image.dtype) - images, has_nsfw_concept = [], [] - for i in range(image.shape[0]): - image_i, has_nsfw_concept_i = self.safety_checker( - clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1] - ) - images.append(image_i) - has_nsfw_concept.append(has_nsfw_concept_i[0]) - image = np.concatenate(images) - - return image, has_nsfw_concept diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py 
b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py deleted file mode 100644 index a66035a789b..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -class StableDiffusionImg2ImgPipelineMixin(StableDiffusionPipelineMixin): - # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - strength: float, - callback_steps: int, - negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - else: - init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = np.concatenate([init_latents], axis=0) - - # add noise to latents using the timesteps - if isinstance(generator, np.random.RandomState): - noise = generator.randn(*init_latents.shape).astype(dtype) - elif isinstance(generator, torch.Generator): - noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." - ) - - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ).numpy() - - return init_latents - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__ - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`Union[np.ndarray, PIL.Image.Image]`): - `Image`, or tensor representing an image batch which will be upscaled. - strength (`float`, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. 
- guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): - A np.random.RandomState to make generation deterministic. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # check inputs. 
Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - image = self.image_processor.preprocess(image) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - latents_dtype = prompt_embeds.dtype - image = image.astype(latents_dtype) - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - - # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) - - timesteps = self.scheduler.timesteps.numpy()[-init_timestep] - timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) - - # 5. Prepare latent variables - latents = self.prepare_latents(image, timesteps, batch_size, num_images_per_prompt, latents_dtype, generator) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - t_start = max(num_inference_steps - init_timestep + offset, 0) - timesteps = self.scheduler.timesteps[t_start:].numpy() - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ - 0 - ] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - latents /= scaling_factor - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py deleted file mode 100644 index cb3c7db96e9..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import PIL_INTERPOLATION - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -def prepare_mask_and_masked_image(image, mask, latents_shape, vae_scale_factor): - image = np.array( - image.convert("RGB").resize((latents_shape[1] * vae_scale_factor, latents_shape[0] * vae_scale_factor)) - ) - image = image[None].transpose(0, 3, 1, 2) - image = image.astype(np.float32) / 127.5 - 1.0 - - image_mask = np.array( - mask.convert("L").resize((latents_shape[1] * vae_scale_factor, latents_shape[0] * vae_scale_factor)) - ) - masked_image = image * (image_mask < 127.5) - - mask = mask.resize((latents_shape[1], latents_shape[0]), PIL_INTERPOLATION["nearest"]) - mask = np.array(mask.convert("L")) - mask = mask.astype(np.float32) / 255.0 - mask = mask[None, None] - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - - return mask, masked_image - - -class StableDiffusionInpaintPipelineMixin(StableDiffusionPipelineMixin): - # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
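For orientation, `prepare_mask_and_masked_image` above reduces to a few NumPy operations. A simplified sketch (pixel resolution only, omitting the latent-resolution mask resize; shapes are illustrative):

import numpy as np
import PIL.Image

def sketch_mask_prep(image: PIL.Image.Image, mask: PIL.Image.Image, size=(512, 512)):
    img = np.array(image.convert("RGB").resize(size), dtype=np.float32)
    img = img[None].transpose(0, 3, 1, 2) / 127.5 - 1.0    # NCHW, scaled to [-1, 1]
    m = np.array(mask.convert("L").resize(size), dtype=np.float32)
    masked = img * (m < 127.5)                             # zero out the regions to repaint
    m = (m / 255.0 >= 0.5).astype(np.float32)[None, None]  # binary mask, NCHW
    return m, masked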
-                )
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        image: PIL.Image.Image,
-        mask_image: PIL.Image.Image,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        eta: float = 0.0,
-        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
-        latents: Optional[np.ndarray] = None,
-        prompt_embeds: Optional[np.ndarray] = None,
-        negative_prompt_embeds: Optional[np.ndarray] = None,
-        output_type: str = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-        callback_steps: int = 1,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`Union[str, List[str]]`):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
-                instead.
-            image (`PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to be inpainted.
-            mask_image (`PIL.Image.Image`):
-                `Image`, or tensor representing a mask of the image batch; white pixels mark the regions to repaint.
-            height (`Optional[int]`, defaults to None):
-                The height in pixels of the generated image.
-            width (`Optional[int]`, defaults to None):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages the model to generate images that are closely linked to the text
-                `prompt`, usually at the expense of lower image quality.
-            negative_prompt (`Optional[Union[str, list]]`):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
-            num_images_per_prompt (`int`, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
-                A np.random.RandomState or torch.Generator to make generation deterministic.
-            latents (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting.
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - - # check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - num_channels_latents = self.vae_decoder.config.get("latent_channels", 4) - num_channels_unet = self.unet.config.get("in_channels", 9) - latents_shape = ( - batch_size * num_images_per_prompt, - num_channels_latents, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - latents_dtype = prompt_embeds.dtype - - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*latents_shape).astype(latents_dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*latents_shape, generator=generator).numpy().astype(latents_dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
- ) - elif latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - - # prepare mask and masked_image - mask, masked_image = prepare_mask_and_masked_image( - image, mask_image, latents_shape[-2:], self.vae_scale_factor - ) - mask = mask.astype(latents.dtype) - masked_image = masked_image.astype(latents.dtype) - - masked_image_latents = self.vae_encoder(sample=masked_image)[0] - - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - masked_image_latents = scaling_factor * masked_image_latents - - # duplicate mask and masked_image_latents for each generation per prompt - mask = mask.repeat(batch_size * num_images_per_prompt, 0) - masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 0) - - mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask - masked_image_latents = ( - np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - ) - - # check that sizes of mask, masked image and latents match - if num_channels_unet == 9: - # default case for runwayml/stable-diffusion-inpainting - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: expects" - f" {num_channels_unet} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" - f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input." - ) - elif num_channels_unet != 4: - raise ValueError( - f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {num_channels_unet}." - ) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
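The 9-channel check above mirrors how the inpainting UNet input is assembled at each step. Schematically (shapes are illustrative, for a 512x512 image with a VAE scale factor of 8):

import numpy as np

latents = np.zeros((1, 4, 64, 64), dtype=np.float32)               # noisy image latents
mask = np.zeros((1, 1, 64, 64), dtype=np.float32)                  # binary inpaint mask
masked_image_latents = np.zeros((1, 4, 64, 64), dtype=np.float32)  # VAE-encoded masked image

unet_input = np.concatenate([latents, mask, masked_image_latents], axis=1)
assert unet_input.shape[1] == 4 + 1 + 4  # the 9 channels expected by the inpainting UNet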
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - # concat latents, mask, masked_image_latnets in the channel dimension - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - if num_channels_unet == 9: - latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1) - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ - 0 - ] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - latents /= scaling_factor - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py deleted file mode 100644 index 0407c16a77a..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ /dev/null @@ -1,506 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput - -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg - - -logger = logging.getLogger(__name__) - - -class StableDiffusionXLPipelineMixin(DiffusionPipelineMixin): - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. 
- """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - - if prompt_embeds is None: - prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - # get prompt text embeddings - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( - text_input_ids, untruncated_ids - ): - removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = text_encoder( - input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds[-2] - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = np.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - negative_prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - negative_prompt_embeds = text_encoder( - input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds[-2] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) - - pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) - negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
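To make the dual-encoder logic above concrete: SDXL runs both CLIP text encoders, keeps the pooled output of the second, and concatenates the per-encoder hidden states along the feature axis. A schematic sketch (hidden sizes are illustrative of SDXL's two encoders):

import numpy as np

emb_1 = np.zeros((1, 77, 768), dtype=np.float32)    # penultimate hidden states, encoder 1
emb_2 = np.zeros((1, 77, 1280), dtype=np.float32)   # penultimate hidden states, encoder 2
pooled_2 = np.zeros((1, 1280), dtype=np.float32)    # pooled output of encoder 2

prompt_embeds = np.concatenate([emb_1, emb_2], axis=-1)  # (1, 77, 2048), fed to the UNet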
- ) - - if prompt_embeds is not None and pooled_prompt_embeds is None: - raise ValueError( - "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." - ) - - if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." - ) - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*shape).astype(dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." - ) - elif latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - return latents - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
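The `inspect`-based filtering used in `prepare_extra_step_kwargs` generalizes to any scheduler; a minimal sketch (helper name invented):

import inspect

def filter_step_kwargs(scheduler, **kwargs):
    # Keep only the kwargs that this scheduler's `step` actually accepts,
    # e.g. `eta` is consumed by DDIMScheduler but unknown to most others.
    accepted = set(inspect.signature(scheduler.step).parameters)
    return {k: v for k, v in kwargs.items() if k in accepted}

So `filter_step_kwargs(scheduler, eta=0.0)` passes `eta` through only when the scheduler supports it, which is why the pipelines can share one denoising loop across schedulers.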
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - extra_step_kwargs = {} - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_eta: - extra_step_kwargs["eta"] = eta - - return extra_step_kwargs - - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.7): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # 0. Default height and width to unet - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) - - # 2. Define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. 
Encode input prompt - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - self.unet.config.get("in_channels", 4), - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Prepare added time ids & embeddings - add_text_embeds = pooled_prompt_embeds - add_time_ids = (original_size + crops_coords_top_left + target_size,) - add_time_ids = np.array(add_time_ids, dtype=prompt_embeds.dtype) - - if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) - add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - # 8. Denoising loop - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latent_model_input, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - text_embeds=add_text_embeds, - time_ids=add_time_ids, - ) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py deleted file mode 100644 index 19988599b64..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ /dev/null @@ -1,515 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput - -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg - - -logger = logging.getLogger(__name__) - - -class StableDiffusionXLImg2ImgPipelineMixin(DiffusionPipelineMixin): - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. 
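The guidance arithmetic in the denoising loops above is the standard classifier-free-guidance combination over one batched UNet pass (unconditional first, text-conditioned second); in isolation:

import numpy as np

def apply_cfg(noise_pred: np.ndarray, guidance_scale: float) -> np.ndarray:
    # One UNet call on the doubled batch yields both branches; split and recombine.
    noise_uncond, noise_text = np.split(noise_pred, 2)
    return noise_uncond + guidance_scale * (noise_text - noise_uncond)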
- - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - - if prompt_embeds is None: - prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - # get prompt text embeddings - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( - text_input_ids, untruncated_ids - ): - removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = text_encoder( - input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds[-2] - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = np.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = 
np.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - negative_prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - negative_prompt_embeds = text_encoder( - input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds[-2] - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) - - pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) - negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - strength: float, - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
- ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].numpy() - - return timesteps, num_inference_steps - t_start - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - else: - init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = np.concatenate([init_latents], axis=0) - - # add noise to latents using the timesteps - if isinstance(generator, np.random.RandomState): - noise = generator.randn(*init_latents.shape).astype(dtype) - elif isinstance(generator, torch.Generator): - noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
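Put together, img2img initialization above amounts to: encode the image, scale it, then noise it to the starting timestep. A condensed sketch using a diffusers scheduler (`image_latents` stands in for the already-scaled output of the ONNX VAE encoder):

import numpy as np
import torch
from diffusers import DDIMScheduler

def noisy_init_latents(image_latents: np.ndarray, scheduler: DDIMScheduler,
                       timestep: np.ndarray, seed: int = 0) -> np.ndarray:
    # image_latents: VAE-encoded input, already multiplied by the scaling factor (~0.18215)
    noise = np.random.RandomState(seed).randn(*image_latents.shape).astype(image_latents.dtype)
    noised = scheduler.add_noise(
        torch.from_numpy(image_latents), torch.from_numpy(noise), torch.from_numpy(timestep)
    )
    return noised.numpy()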
- ) - - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ) - init_latents = init_latents.numpy() - - return init_latents - - def _get_add_time_ids( - self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype - ): - if self.config.get("requires_aesthetics_score"): - add_time_ids = (original_size + crops_coords_top_left + (aesthetic_score,),) - add_neg_time_ids = (original_size + crops_coords_top_left + (negative_aesthetic_score,),) - else: - add_time_ids = (original_size + crops_coords_top_left + target_size,) - add_neg_time_ids = (original_size + crops_coords_top_left + target_size,) - - add_time_ids = np.array(add_time_ids, dtype=dtype) - add_neg_time_ids = np.array(add_neg_time_ids, dtype=dtype) - - return add_time_ids, add_neg_time_ids - - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, - strength: float = 0.3, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - aesthetic_score: float = 6.0, - negative_aesthetic_score: float = 2.5, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`Union[np.ndarray, PIL.Image.Image]`): - `Image`, or tensor representing an image batch which will be upscaled. - strength (`float`, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`Optional[Union[str, list]]`):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
-            num_images_per_prompt (`int`, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
-                A np.random.RandomState or torch.Generator to make generation deterministic.
-            latents (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, defaults to `"pil"`):
-                The output format of the generated image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
-                plain tuple.
-            callback (Optional[Callable], defaults to `None`):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-            guidance_rescale (`float`, defaults to 0.0):
-                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-                Guidance rescale factor should fix overexposure when using zero terminal SNR.
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
-            `tuple`. When returning a tuple, the first element is a list with the generated images.
-        """
-        # 0. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
-
-        # 1.
Define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 2. Encode input prompt - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - ) - - # 3. Preprocess image - image = self.image_processor.preprocess(image) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = np.repeat(timesteps[:1], batch_size * num_images_per_prompt, axis=0) - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - latents_dtype = prompt_embeds.dtype - image = image.astype(latents_dtype) - - # 5. Prepare latent variables - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, latents_dtype, generator - ) - - # 6. Prepare extra step kwargs - extra_step_kwargs = {} - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_eta: - extra_step_kwargs["eta"] = eta - - height, width = latents.shape[-2:] - height = height * self.vae_scale_factor - width = width * self.vae_scale_factor - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 8. Prepare added time ids & embeddings - add_text_embeds = pooled_prompt_embeds - add_time_ids, add_neg_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - aesthetic_score, - negative_aesthetic_score, - dtype=prompt_embeds.dtype, - ) - - if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) - add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) - - # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier-free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latent_model_input, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - text_embeds=add_text_embeds, - time_ids=add_time_ids, - ) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on Section 3.4 in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems like there is a strange result when using a half-precision vae decoder with batch size > 1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py deleted file mode 100644 index e9d5986b61c..00000000000 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
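A rough standalone sketch of the guidance step in the denoising loop above (NumPy only, toy shapes; `guidance_scale` and `guidance_rescale` stand in for the pipeline arguments, and `rescale_noise_cfg`, defined below, implements the same rescaling):

import numpy as np

# stand-in for the stacked unet output: unconditional + text-conditioned
# noise predictions over a batch of 4-channel 8x8 latents
rng = np.random.default_rng(0)
noise_pred = rng.standard_normal((2, 4, 8, 8)).astype(np.float32)
guidance_scale, guidance_rescale = 7.5, 0.7

# classifier-free guidance: extrapolate from the unconditional prediction
# towards the text-conditioned one
noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

# guidance rescale: match the std of the text prediction (fixes overexposure
# with zero terminal SNR schedules), then blend by guidance_rescale
std_text = noise_pred_text.std(axis=(1, 2, 3), keepdims=True)
std_cfg = noise_pred.std(axis=(1, 2, 3), keepdims=True)
noise_pred = guidance_rescale * (noise_pred * std_text / std_cfg) + (1 - guidance_rescale) * noise_pred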
- - -import warnings -from typing import List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers import ConfigMixin -from diffusers.image_processor import VaeImageProcessor as DiffusersVaeImageProcessor -from diffusers.utils.pil_utils import PIL_INTERPOLATION -from PIL import Image -from tqdm.auto import tqdm - - -class DiffusionPipelineMixin(ConfigMixin): - # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L812 - @staticmethod - def numpy_to_pil(images): - """ - Converts a numpy image or a batch of images to a PIL image. - """ - if images.ndim == 3: - images = images[None, ...] - images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] - else: - pil_images = [Image.fromarray(image) for image in images] - - return pil_images - - # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L827 - def progress_bar(self, iterable=None, total=None): - if not hasattr(self, "_progress_bar_config"): - self._progress_bar_config = {} - elif not isinstance(self._progress_bar_config, dict): - raise ValueError( - f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}." - ) - - if iterable is not None: - return tqdm(iterable, **self._progress_bar_config) - elif total is not None: - return tqdm(total=total, **self._progress_bar_config) - else: - raise ValueError("Either `total` or `iterable` has to be defined.") - - -# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L58 -def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): - """ - Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 - """ - std_text = np.std(noise_pred_text, axis=tuple(range(1, noise_pred_text.ndim)), keepdims=True) - std_cfg = np.std(noise_cfg, axis=tuple(range(1, noise_cfg.ndim)), keepdims=True) - # rescale the results from guidance (fixes overexposure) - noise_pred_rescaled = noise_cfg * (std_text / std_cfg) - # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images - noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg - return noise_cfg - - -class VaeImageProcessor(DiffusersVaeImageProcessor): - # Adapted from diffusers.VaeImageProcessor.denormalize - @staticmethod - def denormalize(images: np.ndarray): - """ - Denormalize an image array to [0,1]. - """ - return np.clip(images / 2 + 0.5, 0, 1) - - # Adapted from diffusers.VaeImageProcessor.preprocess - def preprocess( - self, - image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - height: Optional[int] = None, - width: Optional[int] = None, - ) -> np.ndarray: - """ - Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors. 
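- - Example (an illustrative sketch, assuming the default config and a 64x64 RGB PIL input; names are hypothetical): - - import PIL.Image - processor = VaeImageProcessor() - image = PIL.Image.new("RGB", (64, 64)) - out = processor.preprocess(image) # np.ndarray of shape (1, 3, 64, 64) with values in [-1, 1]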
- """ - supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - - do_convert_grayscale = getattr(self.config, "do_convert_grayscale", False) - # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image - if do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: - if isinstance(image, torch.Tensor): - # if image is a pytorch tensor could have 2 possible shapes: - # 1. batch x height x width: we should insert the channel dimension at position 1 - # 2. channnel x height x width: we should insert batch dimension at position 0, - # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 - # for simplicity, we insert a dimension of size 1 at position 1 for both cases - image = image.unsqueeze(1) - else: - # if it is a numpy array, it could have 2 possible shapes: - # 1. batch x height x width: insert channel dimension on last position - # 2. height x width x channel: insert batch dimension on first position - if image.shape[-1] == 1: - image = np.expand_dims(image, axis=0) - else: - image = np.expand_dims(image, axis=-1) - - if isinstance(image, supported_formats): - image = [image] - elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): - raise ValueError( - f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support {', '.join(supported_formats)}" - ) - - if isinstance(image[0], PIL.Image.Image): - if self.config.do_convert_rgb: - image = [self.convert_to_rgb(i) for i in image] - elif do_convert_grayscale: - image = [self.convert_to_grayscale(i) for i in image] - if self.config.do_resize: - height, width = self.get_height_width(image[0], height, width) - image = [self.resize(i, height, width) for i in image] - image = self.reshape(self.pil_to_numpy(image)) - else: - if isinstance(image[0], torch.Tensor): - image = [self.pt_to_numpy(elem) for elem in image] - image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - else: - image = self.reshape(np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)) - - if do_convert_grayscale and image.ndim == 3: - image = np.expand_dims(image, 1) - - # don't need any preprocess if the image is latents - if image.shape[1] == 4: - return image - - if self.config.do_resize: - height, width = self.get_height_width(image, height, width) - image = self.resize(image, height, width) - - # expected range [0,1], normalize to [-1,1] - do_normalize = self.config.do_normalize - if image.min() < 0 and do_normalize: - warnings.warn( - "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " - f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]", - FutureWarning, - ) - do_normalize = False - - if do_normalize: - image = self.normalize(image) - - if getattr(self.config, "do_binarize", False): - image = self.binarize(image) - - return image - - # Adapted from diffusers.VaeImageProcessor.postprocess - def postprocess( - self, - image: np.ndarray, - output_type: str = "pil", - do_denormalize: Optional[List[bool]] = None, - ): - if not isinstance(image, np.ndarray): - raise ValueError( - f"Input for postprocessing is in incorrect format: {type(image)}. 
We only support np array" - ) - if output_type not in ["latent", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - warnings.warn(deprecation_message, FutureWarning) - output_type = "np" - - if output_type == "latent": - return image - - if do_denormalize is None: - do_denormalize = [self.config.do_normalize] * image.shape[0] - - image = np.stack( - [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0 - ) - - image = image.transpose((0, 2, 3, 1)) - - if output_type == "pil": - image = self.numpy_to_pil(image) - - return image - - def get_height_width( - self, - image: Union[PIL.Image.Image, np.ndarray], - height: Optional[int] = None, - width: Optional[int] = None, - ): - """ - This function returns the height and width downscaled to the next integer multiple of - `vae_scale_factor`. - - Args: - image (`PIL.Image.Image`, `np.ndarray`): - The image input. Can be a PIL image, NumPy array or PyTorch tensor. If it is a NumPy array, it should - have shape `[batch, height, width]` or `[batch, height, width, channel]`; if it is a PyTorch tensor, it - should have shape `[batch, channel, height, width]`. - height (`int`, *optional*, defaults to `None`): - The height of the preprocessed image. If `None`, will use the height of the `image` input. - width (`int`, *optional*, defaults to `None`): - The width of the preprocessed image. If `None`, will use the width of the `image` input. - """ - height = height or (image.height if isinstance(image, PIL.Image.Image) else image.shape[-2]) - width = width or (image.width if isinstance(image, PIL.Image.Image) else image.shape[-1]) - # resize to integer multiple of vae_scale_factor - width, height = (x - x % self.config.vae_scale_factor for x in (width, height)) - return height, width - - # Adapted from diffusers.VaeImageProcessor.numpy_to_pt - @staticmethod - def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: - """ - Convert a NumPy image to a PyTorch tensor. - """ - if images.ndim == 3: - images = images[..., None] - - images = torch.from_numpy(images) - return images - - # Adapted from diffusers.VaeImageProcessor.pt_to_numpy - @staticmethod - def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: - """ - Convert a PyTorch tensor to a NumPy image. - """ - images = images.cpu().float().numpy() - return images - - @staticmethod - def reshape(images: np.ndarray) -> np.ndarray: - """ - Reshape inputs to the expected shape. - """ - if images.ndim == 3: - images = images[..., None] - - return images.transpose(0, 3, 1, 2) - - # TODO: remove after diffusers v0.21.0 release - def resize( - self, - image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], - height: Optional[int] = None, - width: Optional[int] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: - """ - Resize image. 
- """ - if isinstance(image, PIL.Image.Image): - image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) - elif isinstance(image, torch.Tensor): - image = torch.nn.functional.interpolate(image, size=(height, width)) - elif isinstance(image, np.ndarray): - image = self.numpy_to_pt(image) - image = torch.nn.functional.interpolate(image, size=(height, width)) - image = self.pt_to_numpy(image) - return image diff --git a/optimum/pipelines/diffusers/watermark.py b/optimum/pipelines/diffusers/watermark.py deleted file mode 100644 index b3cd622edac..00000000000 --- a/optimum/pipelines/diffusers/watermark.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy as np -from imwatermark import WatermarkEncoder - - -WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110 -WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]] - - -# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion_xl/watermark.py#L12 -class StableDiffusionXLWatermarker: - def __init__(self): - self.watermark = WATERMARK_BITS - self.encoder = WatermarkEncoder() - self.encoder.set_watermark("bits", self.watermark) - - def apply_watermark(self, images: np.array): - # can't encode images that are smaller than 256 - if images.shape[-1] < 256: - return images - - # cv2 doesn't support float16 - if images.dtype == np.float16: - images = images.astype(np.float32) - - images = (255 * (images / 2 + 0.5)).transpose((0, 2, 3, 1)) - - images = np.array([self.encoder.encode(image, "dwtDct") for image in images]).transpose((0, 3, 1, 2)) - - np.clip(2 * (images / 255 - 0.5), -1.0, 1.0, out=images) - - return images diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index d1471aa218a..7671d6cd2e6 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -43,7 +43,7 @@ from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.onnx.model_configs import WhisperOnnxConfig from optimum.exporters.onnx.utils import get_speecht5_models_for_export -from optimum.utils import ONNX_WEIGHTS_NAME, DummyPastKeyValuesGenerator, NormalizedTextConfig +from optimum.utils import DummyPastKeyValuesGenerator, NormalizedTextConfig from optimum.utils.save_utils import maybe_load_preprocessors from optimum.utils.testing_utils import grid_parameters, require_diffusers @@ -292,27 +292,22 @@ def _onnx_export( gc.collect() - def _onnx_export_sd(self, model_type: str, model_name: str, device="cpu"): + def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device="cpu"): pipeline = TasksManager.get_model_from_task(model_type, model_name, device=device) models_and_onnx_configs = get_diffusion_models_for_export(pipeline) - output_names = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs] - model, _ = models_and_onnx_configs["vae_encoder"] - model.forward = lambda sample: {"latent_sample": model.encode(x=sample)["latent_dist"].parameters} with TemporaryDirectory() as tmpdirname: _, onnx_outputs = export_models( models_and_onnx_configs=models_and_onnx_configs, opset=14, output_dir=Path(tmpdirname), - output_names=output_names, device=device, ) validate_models_outputs( models_and_onnx_configs=models_and_onnx_configs, onnx_named_outputs=onnx_outputs, output_dir=Path(tmpdirname), - atol=1e-3, - onnx_files_subpaths=output_names, + atol=1e-4, use_subprocess=False, ) @@ -403,7 +398,7 @@ def 
test_tensorflow_export( @require_vision @require_diffusers def test_pytorch_export_for_diffusion_models(self, model_type, model_name): - self._onnx_export_sd(model_type, model_name) + self._onnx_export_diffusion_models(model_type, model_name) @parameterized.expand(PYTORCH_DIFFUSION_MODEL.items()) @require_torch @@ -414,7 +409,7 @@ def test_pytorch_export_for_diffusion_models(self, model_type, model_name): @pytest.mark.run_slow @pytest.mark.gpu_test def test_pytorch_export_for_diffusion_models_cuda(self, model_type, model_name): - self._onnx_export_sd(model_type, model_name, device="cuda") + self._onnx_export_diffusion_models(model_type, model_name, device="cuda") class CustomWhisperOnnxConfig(WhisperOnnxConfig): diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 9f480b2d1a0..956566f0e1f 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -12,10 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import unittest import numpy as np -import PIL import pytest import torch from diffusers import ( @@ -24,6 +22,7 @@ AutoPipelineForText2Image, DiffusionPipeline, ) +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.utils import load_image from parameterized import parameterized from transformers.testing_utils import require_torch_gpu @@ -35,8 +34,7 @@ ORTPipelineForInpainting, ORTPipelineForText2Image, ) -from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor -from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm +from optimum.utils.testing_utils import grid_parameters, require_diffusers def get_generator(framework, seed): @@ -72,16 +70,8 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= return [image] * batch_size -def to_np(image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - class ORTPipelineForText2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] ORTMODEL_CLASS = ORTPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -126,17 +116,16 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): def test_num_images_per_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - height, width, batch_size = 64, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = 
self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -150,61 +139,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - if model_arch == "latent-consistency": - # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step - # TODO: Investigate why this is the case - inputs["num_inference_steps"] = 1 - - for output_type in ["latent", "np"]: + for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - - self.assertTrue( - np.allclose(ort_output, diffusers_output, atol=1e-4), - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), - ) - self.assertEqual(ort_pipeline.device, diffusers_pipeline.device) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 64, 32, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -220,7 +161,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def 
__call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -243,17 +184,21 @@ def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: def test_shape(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - height, width, batch_size = 128, 64, 1 + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + height, width, batch_size = 128, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -263,9 +208,6 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): - if model_arch in ["latent-consistency"]: - pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -279,14 +221,11 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): - if model_arch in ["latent-consistency"]: - pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -295,9 +234,8 @@ def test_negative_prompt(self, model_arch: str): negative_prompt = ["This is a negative prompt"] pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - image_slice_1 = pipeline( - **inputs, negative_prompt=negative_prompt, generator=np.random.RandomState(SEED) - ).images[0, -3:, -3:, -1] + + images_1 = pipeline(**inputs, negative_prompt=negative_prompt, generator=get_generator("pt", SEED)).images prompt = inputs.pop("prompt") if model_arch == "stable-diffusion-xl": @@ -306,39 +244,96 @@ def test_negative_prompt(self, model_arch: str): inputs["negative_prompt_embeds"], inputs["pooled_prompt_embeds"], inputs["negative_pooled_prompt_embeds"], - ) = pipeline._encode_prompt(prompt, 1, False, negative_prompt) + ) = pipeline.encode_prompt( + prompt=prompt, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) else: - text_ids = pipeline.tokenizer( - prompt, - max_length=pipeline.tokenizer.model_max_length, - padding="max_length", - return_tensors="np", - truncation=True, - ).input_ids - negative_text_ids = pipeline.tokenizer( - negative_prompt, - max_length=pipeline.tokenizer.model_max_length, - 
padding="max_length", - return_tensors="np", - truncation=True, - ).input_ids - inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0] - inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0] - - image_slice_2 = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1] - - self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1)) + inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( + prompt=prompt, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) + + images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) + ) + @pytest.mark.rocm_ep_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + + outputs = pipeline(**inputs).images + + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(["stable-diffusion", "latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) + + ort_images = ort_output.images + diffusers_images = diffusers_output.images + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) class ORTPipelineForImage2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] AUTOMODEL_CLASS = AutoPipelineForImage2Image ORTMODEL_CLASS = ORTPipelineForImage2Image TASK = "image-to-image" - def generate_inputs(self, 
height=128, width=128, batch_size=1, channel=3, input_type="np"): + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): inputs = _generate_prompts(batch_size=batch_size) inputs["image"] = _generate_images( @@ -369,11 +364,6 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - - # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): @@ -381,68 +371,18 @@ def test_num_images_per_prompt(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, 
width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): - if model_arch in ["stable-diffusion"]: - pytest.skip( - "Stable Diffusion For Img2Img doesn't behave as expected with callbacks (doesn't call it every step with callback_steps=1)" - ) - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -455,7 +395,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -478,18 +418,21 @@ def test_shape(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - height, width, batch_size = 32, 64, 1 - for input_type in ["np", "pil", "pt"]: + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -499,27 +442,26 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - pytest.skip("Img2Img models do not support support output reproducibility for some reason") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type - self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2)) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): - pytest.skip("Img2Img models do not support support output reproducibility for some reason") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -533,12 +475,73 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - 
self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) + ) + @pytest.mark.rocm_ep_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device.type, "cuda") + + outputs = pipeline(**inputs).images + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(["stable-diffusion", "latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) + + ort_images = ort_output.images + diffusers_images = diffusers_output.images + + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) class ORTPipelineForInpaintingTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] AUTOMODEL_CLASS = AutoPipelineForInpainting ORTMODEL_CLASS = ORTPipelineForInpainting @@ -546,18 +549,16 @@ class ORTPipelineForInpaintingTest(ORTModelTestMixin): TASK = "inpainting" def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): - assert batch_size == 1, "Inpainting models only support batch_size=1" - assert input_type == "pil", "Inpainting models only support input_type='pil'" - inputs = _generate_prompts(batch_size=batch_size) inputs["image"] = _generate_images( - height=height, width=width, batch_size=1, channel=channel, input_type="pil" - )[0] + height=height, width=width, batch_size=batch_size, 
channel=channel, input_type=input_type + ) inputs["mask_image"] = _generate_images( - height=height, width=width, batch_size=1, channel=channel, input_type="pil" - )[0] + height=height, width=width, batch_size=batch_size, channel=1, input_type=input_type + ) + inputs["strength"] = 0.75 inputs["height"] = height inputs["width"] = width @@ -583,11 +584,6 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - - # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): @@ -595,59 +591,14 @@ def test_num_images_per_prompt(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, 
num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -664,7 +615,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -687,18 +638,21 @@ def test_shape(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - height, width, batch_size = 32, 64, 1 - for input_type in ["pil"]: + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -708,11 +662,6 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - if model_arch in ["stable-diffusion"]: - pytest.skip( - "Stable Diffusion For Inpainting fails, it was used to be compared to StableDiffusionPipeline for some reason which is the text-to-image variant" - ) - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -722,23 +671,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - latents_shape = ( - batch_size, - ort_pipeline.vae_decoder.config["latent_channels"], - height // ort_pipeline.vae_scale_factor, - width // ort_pipeline.vae_scale_factor, - ) + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type - np_latents = np.random.rand(*latents_shape).astype(np.float32) - torch_latents = torch.from_numpy(np_latents) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - ort_output = ort_pipeline(**inputs, latents=np_latents).images - diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images - - self.assertTrue( - np.allclose(ort_output, diffusers_output, atol=1e-4), - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), - ) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -756,38 +695,65 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + 
np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) + ) + @pytest.mark.rocm_ep_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device, "cuda") + + outputs = pipeline(**inputs).images + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(["stable-diffusion"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) -class ImageProcessorTest(unittest.TestCase): - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pt = torch.stack(_generate_images(height=8, width=8, batch_size=1, input_type="pt")) - input_np = to_np(input_pt) - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_np = np.stack(_generate_images(height=8, width=8, input_type="np")) - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pil = 
_generate_images(height=8, width=8, batch_size=1, input_type="pil") - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + ort_images = ort_output.images + diffusers_images = diffusers_output.images + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index f6771ce7618..665f253c480 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -148,7 +148,7 @@ def __init__(self, *args, **kwargs): self.ONNX_SEQ2SEQ_MODEL_ID = "optimum/t5-small" self.LARGE_ONNX_SEQ2SEQ_MODEL_ID = "facebook/mbart-large-en-ro" self.TINY_ONNX_SEQ2SEQ_MODEL_ID = "fxmarty/sshleifer-tiny-mbart-onnx" - self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "optimum-internal-testing/tiny-stable-diffusion-onnx" def test_load_model_from_local_path(self): model = ORTModel.from_pretrained(self.LOCAL_MODEL_PATH) @@ -222,17 +222,17 @@ def test_load_seq2seq_model_from_empty_cache(self): @require_diffusers def test_load_stable_diffusion_model_from_cache(self): _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching - model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True ) - self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder) self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_from_empty_cache(self): dirpath = os.path.join( @@ -325,6 +325,8 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test @@ -339,6 +341,8 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers @require_torch_gpu @require_ort_rocm @@ -354,6 +358,8 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): model = ORTStableDiffusionPipeline.from_pretrained( @@ -366,6 +372,8 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): with 
self.assertRaises(ValueError): diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 17f3b391b04..5071d0081af 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -171,6 +171,11 @@ class ORTModelTestMixin(unittest.TestCase): "np": np.ndarray, } + TASK = None + + ORTMODEL_CLASS = None + AUTOMODEL_CLASS = None + @classmethod def setUpClass(cls): cls.onnx_model_dirs = {} From 2c0476eda1398b9a81cb966c817a460ed6e53413 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 9 Oct 2024 18:49:38 +0200 Subject: [PATCH 12/50] Enable ONNX export for transformers 4.45 (#2045) * Enable ONNX export for transformers 4.45 * add comment * update setup --- optimum/exporters/onnx/convert.py | 11 +++++------ setup.py | 7 ++----- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index f2bf95f3e3c..d72fd7eb21a 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -26,7 +26,6 @@ import numpy as np import onnx -import transformers from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available @@ -531,6 +530,11 @@ def export_pytorch( logger.info(f"Using framework PyTorch: {torch.__version__}") FORCE_ONNX_EXTERNAL_DATA = os.getenv("FORCE_ONNX_EXTERNAL_DATA", "0") == "1" + model_kwargs = model_kwargs or {} + # num_logits_to_keep was added in transformers 4.45 and isn't added as inputs when exporting the model + if check_if_transformers_greater("4.44.99") and "num_logits_to_keep" in signature(model.forward).parameters.keys(): + model_kwargs["num_logits_to_keep"] = 0 + with torch.no_grad(): model.config.return_dict = True model = model.eval() @@ -1001,11 +1005,6 @@ def onnx_export_from_model( >>> onnx_export_from_model(model, output="gpt2_onnx/") ``` """ - if check_if_transformers_greater("4.44.99"): - raise ImportError( - f"ONNX conversion disabled for now for transformers version greater than v4.45, found {transformers.__version__}" - ) - TasksManager.standardize_model_attributes(model) if hasattr(model.config, "export_model_type"): diff --git a/setup.py b/setup.py index 0e2f0fd1bb6..63f202faa6e 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,6 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers<4.45.0", ], "onnxruntime-gpu": [ "onnx", @@ -63,10 +62,9 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. 
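The `num_logits_to_keep` handling added to `export_pytorch` above can be summarized with a small sketch (a hypothetical standalone helper, not the exporter's actual API):

from inspect import signature

def pin_num_logits_to_keep(model, model_kwargs=None):
    # transformers >= 4.45 adds a `num_logits_to_keep` argument to forward()
    # that is not part of the ONNX inputs; pinning it to 0 ("keep all logits")
    # before tracing keeps the exported graph equivalent to the eager model.
    model_kwargs = dict(model_kwargs or {})
    if "num_logits_to_keep" in signature(model.forward).parameters:
        model_kwargs["num_logits_to_keep"] = 0
    return model_kwargs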
- "transformers<4.45.0", ], - "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.45.0"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.45.0"], + "exporters": ["onnx", "onnxruntime", "timm"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", @@ -77,7 +75,6 @@ "numpy<1.24.0", "datasets<=2.16", "transformers[sentencepiece]>=4.26,<4.38", - "transformers<4.45.0", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", From 1b5a63da593599b1e6e178754146e0109d3305d9 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:52:35 +0200 Subject: [PATCH 13/50] Remove the need for the config to be in the subfolder (#2044) * remove the need for the config to be in the subfolder * fix * fix for offline mode * add log * fix * enable load local model in subfolder * fix windows --- optimum/modeling_base.py | 36 ++++++++++++++++++----------- optimum/onnxruntime/modeling_ort.py | 6 ++--- tests/onnxruntime/test_modeling.py | 15 ++++++++++++ 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 29521b7c0c6..48c738514ae 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -380,27 +380,35 @@ def from_pretrained( ) model_id, revision = model_id.split("@") + all_files, _ = TasksManager.get_model_files( + model_id, + subfolder=subfolder, + cache_dir=cache_dir, + revision=revision, + token=token, + ) + + config_folder = subfolder + if cls.config_name not in all_files: + logger.info( + f"{cls.config_name} not found in the specified subfolder {subfolder}. Using the top level {cls.config_name}." + ) + config_folder = "" + library_name = TasksManager.infer_library_from_model( - model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + model_id, subfolder=config_folder, revision=revision, cache_dir=cache_dir, token=token ) if library_name == "timm": config = PretrainedConfig.from_pretrained( - model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + model_id, subfolder=config_folder, revision=revision, cache_dir=cache_dir, token=token ) if config is None: - if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME: - if CONFIG_NAME in os.listdir(os.path.join(model_id, subfolder)): - config = AutoConfig.from_pretrained( - os.path.join(model_id, subfolder), trust_remote_code=trust_remote_code - ) - elif CONFIG_NAME in os.listdir(model_id): + if os.path.isdir(os.path.join(model_id, config_folder)) and cls.config_name == CONFIG_NAME: + if CONFIG_NAME in os.listdir(os.path.join(model_id, config_folder)): config = AutoConfig.from_pretrained( - os.path.join(model_id, CONFIG_NAME), trust_remote_code=trust_remote_code - ) - logger.info( - f"config.json not found in the specified subfolder {subfolder}. Using the top level config.json." 
+ os.path.join(model_id, config_folder), trust_remote_code=trust_remote_code ) else: raise OSError(f"config.json not found in {model_id} local folder") @@ -411,7 +419,7 @@ def from_pretrained( cache_dir=cache_dir, token=token, force_download=force_download, - subfolder=subfolder, + subfolder=config_folder, trust_remote_code=trust_remote_code, ) elif isinstance(config, (str, os.PathLike)): @@ -421,7 +429,7 @@ def from_pretrained( cache_dir=cache_dir, token=token, force_download=force_download, - subfolder=subfolder, + subfolder=config_folder, trust_remote_code=trust_remote_code, ) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 9b29afa566b..ce1d68536ac 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -510,13 +510,12 @@ def _from_pretrained( if file_name is None: if model_path.is_dir(): - onnx_files = list(model_path.glob("*.onnx")) + onnx_files = list((model_path / subfolder).glob("*.onnx")) else: repo_files, _ = TasksManager.get_model_files( model_id, revision=revision, cache_dir=cache_dir, token=token ) repo_files = map(Path, repo_files) - pattern = "*.onnx" if subfolder == "" else f"{subfolder}/*.onnx" onnx_files = [p for p in repo_files if p.match(pattern)] @@ -983,10 +982,9 @@ def _cached_file( token = use_auth_token model_path = Path(model_path) - # locates a file in a local folder and repo, downloads and cache it if necessary. if model_path.is_dir(): - model_cache_path = model_path / file_name + model_cache_path = model_path / subfolder / file_name preprocessors = maybe_load_preprocessors(model_path.as_posix()) else: model_cache_path = hf_hub_download( diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 665f253c480..501c7dac240 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -28,6 +28,7 @@ import requests import timm import torch +from huggingface_hub import HfApi from huggingface_hub.constants import default_cache_path from parameterized import parameterized from PIL import Image @@ -1263,6 +1264,20 @@ def test_trust_remote_code(self): torch.allclose(pt_logits, ort_logits, atol=1e-4), f" Maxdiff: {torch.abs(pt_logits - ort_logits).max()}" ) + @parameterized.expand(("", "onnx")) + def test_loading_with_config_not_from_subfolder(self, subfolder): + # config.json file in the root directory and not in the subfolder + model_id = "sentence-transformers-testing/stsb-bert-tiny-onnx" + # hub model + ORTModelForFeatureExtraction.from_pretrained(model_id, subfolder=subfolder, export=subfolder == "") + # local model + api = HfApi() + with tempfile.TemporaryDirectory() as tmpdirname: + local_dir = Path(tmpdirname) / "model" + api.snapshot_download(repo_id=model_id, local_dir=local_dir) + ORTModelForFeatureExtraction.from_pretrained(local_dir, subfolder=subfolder, export=subfolder == "") + remove_directory(tmpdirname) + class ORTModelForQuestionAnsweringIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = [ From 851f04b13aab9f17f1c3b5080767a2cc440bb2b1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:54:01 +0200 Subject: [PATCH 14/50] Remove upper transformers version limit (#2048) --- setup.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 63f202faa6e..fb290274a3b 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - 
"transformers[sentencepiece]>=4.29,<4.46.0", + "transformers[sentencepiece]>=4.29", "torch>=1.11", "packaging", "numpy", @@ -54,6 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", + "transformers<4.46.0", ], "onnxruntime-gpu": [ "onnx", @@ -62,9 +63,10 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. + "transformers<4.46.0", ], - "exporters": ["onnx", "onnxruntime", "timm"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], + "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.46.0"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.46.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", From 4ce73646eb13dff14503092eeb92f94d6a1ee7b1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:57:31 +0200 Subject: [PATCH 15/50] Dev version (#2049) --- optimum/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/version.py b/optimum/version.py index 4a8a7edab63..4fff28e5c97 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.23.0.dev0" +__version__ = "1.24.0.dev0" From 6172e96914d6f49aec253db05c98d827c158caab Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:24:00 +0200 Subject: [PATCH 16/50] Fix doc build (#2050) * Fix doc build * Trigger PR doc build when the PR doc build workflow is modified * Fix issue with torch-xla and ubuntu-latest --- .github/workflows/build_main_documentation.yml | 8 ++++++-- .github/workflows/build_pr_documentation.yml | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index 20face917ab..11e36ed57f3 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -10,7 +10,7 @@ on: jobs: build_documentation: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -66,7 +66,7 @@ jobs: sudo apt-get purge -y '^mysql.*' sudo apt-get purge -y '^java.*' sudo apt-get purge -y '^openjdk.*' - sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel + sudo apt-get purge -y microsoft-edge-stable azure-cli google-chrome-stable firefox mono-devel df -h sudo apt-get autoremove -y >/dev/null 2>&1 sudo apt-get clean @@ -110,6 +110,8 @@ jobs: - name: Setup environment run: | + python -m venv venv-doc + source venv-doc/bin/activate pip uninstall -y doc-builder cd doc-builder git pull origin main @@ -135,6 +137,7 @@ jobs: - name: Make Furiosa documentation run: | + source venv-doc/bin/activate cd optimum-furiosa pip install . sudo apt install software-properties-common @@ -159,6 +162,7 @@ jobs: - name: Make TPU documentation run: | sudo docker system prune -a -f + source venv-doc/bin/activate cd optimum-tpu pip install -U pip pip install . 
-f https://storage.googleapis.com/libtpu-releases/index.html

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index e5f2dcb0d18..6eb09aff304 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -8,6 +8,7 @@ on:
       - "optimum/**.py"
       - "docs/**.mdx"
       - "docs/**.yml"
+      - ".github/workflows/build_pr_documentation.yml"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -15,7 +16,7 @@ concurrency:
 
 jobs:
   build_documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     env:
       COMMIT_SHA: ${{ github.event.pull_request.head.sha }}
       PR_NUMBER: ${{ github.event.number }}
@@ -60,6 +61,8 @@ jobs:
 
       - name: Setup environment
         run: |
+          python -m venv venv-doc
+          source venv-doc/bin/activate
          pip uninstall -y doc-builder
           cd doc-builder
           git pull origin main
@@ -99,6 +102,7 @@ jobs:
       - name: Make TPU documentation
         run: |
           sudo docker system prune -a -f
+          source venv-doc/bin/activate
           cd optimum-tpu
           pip install -U pip
           pip install . -f https://storage.googleapis.com/libtpu-releases/index.html

From eb6f5de5ce3eb69f73d7a0ee0da30f9bd8ca2a08 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Fri, 11 Oct 2024 10:27:47 +0200
Subject: [PATCH 17/50] Don't hardcode the logger level to INFO; let users set TRANSFORMERS_VERBOSITY (#2047)

And keep the default as WARNING, i.e. the level expected for Python modules
---
 optimum/exporters/onnx/__main__.py    | 1 -
 optimum/exporters/tflite/__main__.py  | 1 -
 optimum/onnx/transformations_utils.py | 1 -
 3 files changed, 3 deletions(-)

diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py
index 703e98df3e2..6a2cc6834a6 100644
--- a/optimum/exporters/onnx/__main__.py
+++ b/optimum/exporters/onnx/__main__.py
@@ -43,7 +43,6 @@ from .base import OnnxConfig
 
 logger = logging.get_logger()
-logger.setLevel(logging.INFO)
 
 
 def main_export(
diff --git a/optimum/exporters/tflite/__main__.py b/optimum/exporters/tflite/__main__.py
index b3c90cb63f2..0c4c7b994fa 100644
--- a/optimum/exporters/tflite/__main__.py
+++ b/optimum/exporters/tflite/__main__.py
@@ -28,7 +28,6 @@
 
 logger = logging.get_logger()
-logger.setLevel(logging.INFO)
 
 
 def main():
diff --git a/optimum/onnx/transformations_utils.py b/optimum/onnx/transformations_utils.py
index 1f0765112e8..fe55a5a5770 100644
--- a/optimum/onnx/transformations_utils.py
+++ b/optimum/onnx/transformations_utils.py
@@ -29,7 +29,6 @@
 
 logger = logging.get_logger()
-logger.setLevel(logging.INFO)
 
 
 def _find_duplicate_initializers(

From 690d35b1ab31f375f5a4b74bf6eba37517656c05 Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Fri, 11 Oct 2024 10:37:45 +0200
Subject: [PATCH 18/50] Add workflow to mark issues as stale (#2051)

---
 .github/workflows/stale.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 .github/workflows/stale.yml

diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
new file mode 100644
index 00000000000..a5e50a795b6
--- /dev/null
+++ b/.github/workflows/stale.yml
@@ -0,0 +1,14 @@
+name: 'Close stale issues and PRs'
+on:
+  schedule:
+    - cron: '30 1 * * *'
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v8
+        with:
+          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
+          days-before-stale: 30
+          days-before-close: 5

From b42db7ee6b5fa43e41adcbd501a3bd183b589991 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Fri, 11 Oct 2024 11:04:50 +0200
Subject: [PATCH 19/50] Fix onnx export CLI for transformers >= 4.45 (#2053)

* fix onnx export
* add test
---
 optimum/exporters/onnx/convert.py               | 3 ++-
 tests/exporters/onnx/test_exporters_onnx_cli.py | 8 ++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index d72fd7eb21a..565183b38fc 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -26,6 +26,7 @@
 
 import numpy as np
 import onnx
+from transformers.generation import GenerationMixin
 from transformers.modeling_utils import get_parameter_dtype
 from transformers.utils import is_tf_available, is_torch_available
 
@@ -1127,7 +1128,7 @@ def onnx_export_from_model(
 
     if check_if_transformers_greater("4.44.99"):
         misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
-        if model.can_generate() and len(misplaced_generation_parameters) > 0:
+        if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
             logger.warning(
                 "Moving the following attributes in the config to the generation config: "
                 f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py
index ed611ade04e..8b186e9307b 100644
--- a/tests/exporters/onnx/test_exporters_onnx_cli.py
+++ b/tests/exporters/onnx/test_exporters_onnx_cli.py
@@ -602,6 +602,14 @@ def test_diffusion(self):
             check=True,
         )
 
+    def test_sentence_transformers(self):
+        with TemporaryDirectory() as tmpdirname:
+            subprocess.run(
+                f"python3 -m optimum.exporters.onnx --model sentence-transformers-testing/stsb-bert-tiny-onnx --task feature-extraction {tmpdirname}",
+                shell=True,
+                check=True,
+            )
+
     def test_legacy(self):
         with TemporaryDirectory() as tmpdirname:
             subprocess.run(

From 94201540ac41b0a86042b04df0a3b374793761b8 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Fri, 11 Oct 2024 12:18:30 +0200
Subject: [PATCH 20/50] Fix onnx export for transformers>=4.45 (#2054)

* fix onnx export for transformers>=4.45
* fix tests
* style
---
 optimum/exporters/onnx/convert.py               | 6 +++++-
 tests/exporters/onnx/test_exporters_onnx_cli.py | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index 565183b38fc..2661d835979 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -1128,7 +1128,11 @@ def onnx_export_from_model(
 
     if check_if_transformers_greater("4.44.99"):
         misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
-        if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
+        if (
+            isinstance(model, GenerationMixin)
+            and model.can_generate()
+            and len(misplaced_generation_parameters) > 0
+        ):
             logger.warning(
                 "Moving the following attributes in the config to the generation config: "
                 f"{misplaced_generation_parameters}.
You are seeing this warning because you've set " diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py index 8b186e9307b..9ac7832aa7d 100644 --- a/tests/exporters/onnx/test_exporters_onnx_cli.py +++ b/tests/exporters/onnx/test_exporters_onnx_cli.py @@ -602,6 +602,7 @@ def test_diffusion(self): check=True, ) + @require_sentence_transformers def test_sentence_transformers(self): with TemporaryDirectory() as tmpdirname: subprocess.run( From 0d808ade96b01e35e5c8a38b0b156ce4b241f433 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 11 Oct 2024 16:23:17 +0200 Subject: [PATCH 21/50] Upgrade macOS image for tests compatibility with numpy v2 (#2055) * update runner environment * fix * downgrade * Update .github/workflows/test_bettertransformer.yml Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update .github/workflows/test_bettertransformer.yml Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * fix --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- .github/workflows/test_bettertransformer.yml | 5 ++--- .github/workflows/test_onnx.yml | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_bettertransformer.yml b/.github/workflows/test_bettertransformer.yml index 080d8272dfc..b023fa4bd1b 100644 --- a/.github/workflows/test_bettertransformer.yml +++ b/.github/workflows/test_bettertransformer.yml @@ -15,9 +15,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-13] - exclude: [{ python-version: 3.8, os: macos-13 }] + python-version: [3.9] + os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 9aa8b307235..22a11720798 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-13] + os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} steps: From 058593927303ec94726c15a0cebf4da96ee628ec Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:45:17 +0200 Subject: [PATCH 22/50] Fix main doc build (#2057) --- .github/workflows/build_main_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index 11e36ed57f3..efd61c1fd4f 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -57,6 +57,7 @@ jobs: - name: Free disk space run: | df -h + sudo apt-get update sudo apt-get purge -y '^apache.*' sudo apt-get purge -y '^imagemagick.*' sudo apt-get purge -y '^dotnet.*' From 9fd9ca5505e83f67c2a5e4c4f6f56d5fcf28442f Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 14 Oct 2024 12:54:15 +0200 Subject: [PATCH 23/50] Enter venv before pushing doc in main doc build workflow (#2058) --- .github/workflows/build_main_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index efd61c1fd4f..c922f5097da 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ 
-197,6 +197,7 @@ jobs: - name: Push to repositories run: | + source venv-doc/bin/activate cd optimum/optimum-doc-build sudo chmod -R ugo+rwx optimum doc-builder push optimum --doc_build_repo_id "hf-doc-build/doc-build" --token "${{ secrets.HF_DOC_BUILD_PUSH }}" --commit_msg "Updated with commit ${{ github.sha }} See: https://github.com/huggingface/optimum/commit/${{ github.sha }}" --n_retries 5 --upload_version_yml From d11e2850158694258875b6e4fcba1c5db61af42a Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:12:05 +0200 Subject: [PATCH 24/50] Fix tests expected environment variable name (#2059) * fix env variable name * fix test * comment * load onnx revision --- .github/workflows/test_onnxruntime.yml | 2 +- tests/onnxruntime/test_modeling.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 291a3b08335..a72bedb1ab7 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -51,7 +51,7 @@ jobs: - name: Test with pytest (in parallel) env: - FXMARTYCLONE_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} working-directory: tests run: | pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 501c7dac240..33243da278a 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -974,9 +974,13 @@ def test_load_model_from_hub_private(self): token = os.environ.get("HF_HUB_READ_TOKEN", None) if token is None: - self.skipTest("Test requires a token for fxmartyclone in the environment variable `HF_HUB_READ_TOKEN`.") + self.skipTest( + "Test requires a read access token for optimum-internal-testing in the environment variable `HF_HUB_READ_TOKEN`." 
+ ) - model = ORTModelForCustomTasks.from_pretrained("optimum-internal-testing/tiny-random-phi-private", token=token) + model = ORTModelForCustomTasks.from_pretrained( + "optimum-internal-testing/tiny-random-phi-private", revision="onnx", token=token + ) self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) From 1e5014e70f17e0437c4b0a7f4e65e170688d8ab0 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:12:17 +0200 Subject: [PATCH 25/50] Remove unused HF_TOKEN environment variable (#2061) remove unused HF_TOKEN environment variable --- .github/workflows/test_fx_automatic_parallel.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test_fx_automatic_parallel.yml b/.github/workflows/test_fx_automatic_parallel.yml index 05ebf7ea9e5..c5d82be38b3 100644 --- a/.github/workflows/test_fx_automatic_parallel.yml +++ b/.github/workflows/test_fx_automatic_parallel.yml @@ -35,7 +35,6 @@ jobs: options: --mount type=tmpfs,destination=/tmp --shm-size 64gb --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/ env: NCCL_DEBUG: INFO - HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} defaults: run: shell: bash From 8e54205b3b6b45f10f6360c05bb3a560a27354fe Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:06:48 +0200 Subject: [PATCH 26/50] Fix compatibility with diffusers < 0.25.0 (#2063) * Fix compatibility with diffusers < 0.25.0 * fix import --- optimum/onnxruntime/modeling_diffusion.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 87fcb68c7e9..3899a7b36b6 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -26,7 +26,6 @@ import numpy as np import torch from diffusers.configuration_utils import ConfigMixin -from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution from diffusers.pipelines import ( AutoPipelineForImage2Image, AutoPipelineForInpainting, @@ -52,6 +51,7 @@ from transformers.modeling_outputs import ModelOutput import onnxruntime as ort +from optimum.utils import check_if_diffusers_greater from ..exporters.onnx import main_export from ..onnx.utils import _get_model_external_data_paths @@ -73,6 +73,12 @@ ) +if check_if_diffusers_greater("0.25.0"): + from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution +else: + from diffusers.models.vae import DiagonalGaussianDistribution + + logger = logging.getLogger(__name__) From 59d6f7f04e390fb13fcba62bf22cea6ff2030623 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:59:22 +0200 Subject: [PATCH 27/50] Clean up ORT documentation (#2065) * refactor ort doc * fix links * fix --- .../package_reference/modeling_ort.mdx | 5 + .../onnxruntime/usage_guides/models.mdx | 275 +++--------------- 2 files changed, 48 insertions(+), 232 deletions(-) diff --git a/docs/source/onnxruntime/package_reference/modeling_ort.mdx b/docs/source/onnxruntime/package_reference/modeling_ort.mdx index 65b2b60195a..2c93ab3ac0d 100644 --- a/docs/source/onnxruntime/package_reference/modeling_ort.mdx +++ b/docs/source/onnxruntime/package_reference/modeling_ort.mdx @@ -119,6 +119,11 @@ The following ORT classes are available for the following custom tasks. 
## Stable Diffusion +#### ORTDiffusionPipeline + +[[autodoc]] onnxruntime.ORTDiffusionPipeline + - __call__ + #### ORTStableDiffusionPipeline [[autodoc]] onnxruntime.ORTStableDiffusionPipeline diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 131822e9568..1292e755c06 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -4,263 +4,74 @@ Optimum is a utility package for building and running inference with accelerated Optimum can be used to load optimized models from the [Hugging Face Hub](hf.co/models) and create pipelines to run accelerated inference without rewriting your APIs. -## Switching from Transformers to Optimum -The `optimum.onnxruntime.ORTModelForXXX` model classes are API compatible with Hugging Face Transformers models. This -means you can just replace your `AutoModelForXXX` class with the corresponding `ORTModelForXXX` class in `optimum.onnxruntime`. +## Loading -You do not need to adapt your code to get it to work with `ORTModelForXXX` classes: +### Transformers models -```diff -from transformers import AutoTokenizer, pipeline --from transformers import AutoModelForQuestionAnswering -+from optimum.onnxruntime import ORTModelForQuestionAnswering - --model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2") # PyTorch checkpoint -+model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2") # ONNX checkpoint -tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2") - -onnx_qa = pipeline("question-answering",model=model,tokenizer=tokenizer) - -question = "What's my name?" -context = "My name is Philipp and I live in Nuremberg." -pred = onnx_qa(question, context) -``` - -### Loading a vanilla Transformers model - -Because the model you want to work with might not be already converted to ONNX, [`~optimum.onnxruntime.ORTModel`] -includes a method to convert vanilla Transformers models to ONNX ones. Simply pass `export=True` to the -[`~optimum.onnxruntime.ORTModel.from_pretrained`] method, and your model will be loaded and converted to ONNX on-the-fly: - -```python ->>> from optimum.onnxruntime import ORTModelForSequenceClassification - ->>> # Load the model from the hub and export it to the ONNX format ->>> model = ORTModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased-finetuned-sst-2-english", export=True -... ) -``` - -### Pushing ONNX models to the Hugging Face Hub - -It is also possible, just as with regular [`~transformers.PreTrainedModel`]s, to push your `ORTModelForXXX` to the -[Hugging Face Model Hub](https://hf.co/models): - -```python ->>> from optimum.onnxruntime import ORTModelForSequenceClassification - ->>> # Load the model from the hub and export it to the ONNX format ->>> model = ORTModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased-finetuned-sst-2-english", export=True -... ) - ->>> # Save the converted model ->>> model.save_pretrained("a_local_path_for_convert_onnx_model") - -# Push the onnx model to HF Hub ->>> model.push_to_hub( # doctest: +SKIP -... "a_local_path_for_convert_onnx_model", repository_id="my-onnx-repo", use_auth_token=True -... ) -``` - -## Sequence-to-sequence models - -Sequence-to-sequence (Seq2Seq) models can also be used when running inference with ONNX Runtime. 
When Seq2Seq models -are exported to the ONNX format, they are decomposed into three parts that are later combined during inference: -- The encoder part of the model -- The decoder part of the model + the language modeling head -- The same decoder part of the model + language modeling head but taking and using pre-computed key / values as inputs and -outputs. This makes inference faster. - -Here is an example of how you can load a T5 model to the ONNX format and run inference for a translation task: - - -```python ->>> from transformers import AutoTokenizer, pipeline ->>> from optimum.onnxruntime import ORTModelForSeq2SeqLM - -# Load the model from the hub and export it to the ONNX format ->>> model_name = "t5-small" ->>> model = ORTModelForSeq2SeqLM.from_pretrained(model_name, export=True) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) - -# Create a pipeline ->>> onnx_translation = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) ->>> text = "He never went out without a book under his arm, and he often came back with two." ->>> result = onnx_translation(text) ->>> # [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] -``` - -## Stable Diffusion - -Stable Diffusion models can also be used when running inference with ONNX Runtime. When Stable Diffusion models -are exported to the ONNX format, they are split into four components that are later combined during inference: -- The text encoder -- The U-NET -- The VAE encoder -- The VAE decoder - -Make sure you have 🤗 Diffusers installed. - -To install `diffusers`: -```bash -pip install diffusers -``` - -### Text-to-Image - -Here is an example of how you can load an ONNX Stable Diffusion model and run inference using ONNX Runtime: - -```python -from optimum.onnxruntime import ORTStableDiffusionPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipeline = ORTStableDiffusionPipeline.from_pretrained(model_id, revision="onnx") -prompt = "sailing ship in storm by Leonardo da Vinci" -image = pipeline(prompt).images[0] -``` +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `AutoModelForXxx` class with the corresponding `ORTModelForXxx`. -To load your PyTorch model and convert it to ONNX on-the-fly, you can set `export=True`. - -```python -pipeline = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True) - -# Don't forget to save the ONNX model -save_directory = "a_local_path" -pipeline.save_pretrained(save_directory) -``` - -
- -
- -### Image-to-Image - -```python -import requests -import torch -from PIL import Image -from io import BytesIO -from optimum.onnxruntime import ORTStableDiffusionImg2ImgPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipeline = ORTStableDiffusionImg2ImgPipeline.from_pretrained(model_id, revision="onnx") - -url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" - -response = requests.get(url) -init_image = Image.open(BytesIO(response.content)).convert("RGB") -init_image = init_image.resize((768, 512)) - -prompt = "A fantasy landscape, trending on artstation" - -image = pipeline(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] -image.save("fantasy_landscape.png") -``` - -### Inpaint - -```python -import PIL -import requests -import torch -from io import BytesIO -from optimum.onnxruntime import ORTStableDiffusionInpaintPipeline - -model_id = "runwayml/stable-diffusion-inpainting" -pipeline = ORTStableDiffusionInpaintPipeline.from_pretrained(model_id, revision="onnx") - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - -img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" -mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" +```diff + from transformers import AutoTokenizer, pipeline +- from transformers import AutoModelForQuestionAnswering ++ from optimum.onnxruntime import ORTModelForQuestionAnswering -init_image = download_image(img_url).resize((512, 512)) -mask_image = download_image(mask_url).resize((512, 512)) +- model = AutoModelForQuestionAnswering.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint ++ model = ORTModelForQuestionAnswering.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + result = pipe("He never went out without a book under his arm") ``` +More information for all the supported `ORTModelForXxx` in our [documentation](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort) -## Stable Diffusion XL - -Before using `ORTStableDiffusionXLPipeline` make sure to have `diffusers` and `invisible_watermark` installed. You can install the libraries as follows: -```bash -pip install diffusers -pip install invisible-watermark>=0.2.0 -``` - -### Text-to-Image - -Here is an example of how you can load a SDXL ONNX model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference using ONNX Runtime : +### Diffusers models -```python -from optimum.onnxruntime import ORTStableDiffusionXLPipeline +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `DiffusionPipeline` class with the corresponding `ORTDiffusionPipeline`. 
-model_id = "stabilityai/stable-diffusion-xl-base-1.0" -base = ORTStableDiffusionXLPipeline.from_pretrained(model_id) -prompt = "sailing ship in storm by Leonardo da Vinci" -image = base(prompt).images[0] -# Don't forget to save the ONNX model -save_directory = "sd_xl_base" -base.save_pretrained(save_directory) +```diff +- from diffusers import DiffusionPipeline ++ from optimum.onnxruntime import ORTDiffusionPipeline + + model_id = "runwayml/stable-diffusion-v1-5" +- pipeline = DiffusionPipeline.from_pretrained(model_id) ++ pipeline = ORTDiffusionPipeline.from_pretrained(model_id, revision="onnx") + prompt = "sailing ship in storm by Leonardo da Vinci" + image = pipeline(prompt).images[0] ``` +## Converting your model to ONNX on-the-fly -### Image-to-Image - -Here is an example of how you can load a PyTorch SDXL model, convert it to ONNX on-the-fly and run inference using ONNX Runtime for *image-to-image* : +In case your model wasn't already [converted to ONNX](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), [`~optimum.onnxruntime.ORTModel`] includes a method to convert your model to ONNX on-the-fly. +Simply pass `export=True` to the [`~optimum.onnxruntime.ORTModel.from_pretrained`] method, and your model will be loaded and converted to ONNX on-the-fly: ```python -from optimum.onnxruntime import ORTStableDiffusionXLImg2ImgPipeline -from diffusers.utils import load_image - -model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" -pipeline = ORTStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) +>>> from optimum.onnxruntime import ORTModelForSequenceClassification -url = "https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/castle_friedrich.png" -image = load_image(url).convert("RGB") -prompt = "medieval castle by Caspar David Friedrich" -image = pipeline(prompt, image=image).images[0] -image.save("medieval_castle.png") +>>> # Load the model from the hub and export it to the ONNX format +>>> model_id = "distilbert-base-uncased-finetuned-sst-2-english" +>>> model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True) ``` -### Refining the image output - -The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0). In this case, you only have to output the latents from the base model. +## Pushing your model to the Hub +You can also call `push_to_hub` directly on your model to upload it to the [Hub](https://hf.co/models). 
```python -from optimum.onnxruntime import ORTStableDiffusionXLImg2ImgPipeline - -model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" -refiner = ORTStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) - -image = base(prompt=prompt, output_type="latent").images[0] -image = refiner(prompt=prompt, image=image[None, :]).images[0] -image.save("sailing_ship.png") -``` - - - -## Latent Consistency Models - -### Text-to-Image +>>> from optimum.onnxruntime import ORTModelForSequenceClassification -Here is an example of how you can load a Latent Consistency Models (LCMs) from [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and run inference using ONNX Runtime : +>>> # Load the model from the hub and export it to the ONNX format +>>> model_id = "distilbert-base-uncased-finetuned-sst-2-english" +>>> model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True) -```python -from optimum.onnxruntime import ORTLatentConsistencyModelPipeline +>>> # Save the converted model locally +>>> output_dir = "a_local_path_for_convert_onnx_model" +>>> model.save_pretrained(output_dir) -model_id = "SimianLuo/LCM_Dreamshaper_v7" -pipeline = ORTLatentConsistencyModelPipeline.from_pretrained(model_id, export=True) -prompt = "sailing ship in storm by Leonardo da Vinci" -images = pipeline(prompt, num_inference_steps=4, guidance_scale=8.0).images -``` +# Push the onnx model to HF Hub +>>> model.push_to_hub(output_dir, repository_id="my-onnx-repo") # doctest: +SKIP +``` \ No newline at end of file From 8af46e53bd1321d325ea4e712e7da8aca98df49f Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Sat, 19 Oct 2024 20:37:48 +0200 Subject: [PATCH 28/50] Fix ort documentation code snippet (#2070) fix code snippet --- docs/source/onnxruntime/usage_guides/models.mdx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 1292e755c06..905e6632c05 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -13,11 +13,11 @@ Once your model was [exported to the ONNX format](https://huggingface.co/docs/op ```diff from transformers import AutoTokenizer, pipeline -- from transformers import AutoModelForQuestionAnswering -+ from optimum.onnxruntime import ORTModelForQuestionAnswering +- from transformers import AutoModelForCausalLM ++ from optimum.onnxruntime import ORTModelForCausalLM -- model = AutoModelForQuestionAnswering.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint -+ model = ORTModelForQuestionAnswering.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint +- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint ++ model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) @@ -74,4 +74,4 @@ You can also call `push_to_hub` directly on your model to upload it to the [Hub] # Push the onnx model to HF Hub >>> model.push_to_hub(output_dir, repository_id="my-onnx-repo") # doctest: +SKIP -``` \ No newline at end of file +``` From 58c3571156466c13ebc7e22d3405b376ab2d222b Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 22 Oct 
2024 16:48:29 +0200 Subject: [PATCH 29/50] Update the habana extra (#2077) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fb290274a3b..822d8be1b80 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ "nncf": "optimum-intel[nncf]>=1.18.0", "neural-compressor": "optimum-intel[neural-compressor]>=1.18.0", "ipex": "optimum-intel[ipex]>=1.18.0", - "habana": ["optimum-habana", "transformers>=4.43.0,<4.44.0"], + "habana": ["optimum-habana", "transformers>=4.45.0,<4.46.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers>=4.36.2,<4.42.0"], "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers>=4.36.2,<4.42.0"], "graphcore": "optimum-graphcore", From 2e637be5d6b3e15c2b300130599bcec0f3e12ec8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 25 Oct 2024 13:41:10 +0200 Subject: [PATCH 30/50] Add sentence-transformers and timm documentation example (#2072) * add sentence-transformers and timm example to documentation * replace with onnx models * rephrase --- .../onnxruntime/usage_guides/models.mdx | 58 ++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 905e6632c05..27ac446096b 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -9,7 +9,7 @@ to run accelerated inference without rewriting your APIs. ### Transformers models -Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `AutoModelForXxx` class with the corresponding `ORTModelForXxx`. +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `AutoModelForXxx` with the corresponding `ORTModelForXxx` class. ```diff from transformers import AutoTokenizer, pipeline @@ -29,7 +29,7 @@ More information for all the supported `ORTModelForXxx` in our [documentation](h ### Diffusers models -Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `DiffusionPipeline` class with the corresponding `ORTDiffusionPipeline`. +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `DiffusionPipeline` with the corresponding `ORTDiffusionPipeline` class. ```diff @@ -43,6 +43,60 @@ Once your model was [exported to the ONNX format](https://huggingface.co/docs/op image = pipeline(prompt).images[0] ``` + +### Sentence Transformers models + +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `AutoModel` with the corresponding `ORTModelForFeatureExtraction` class. 
+ +```diff + from transformers import AutoTokenizer +- from transformers import AutoModel ++ from optimum.onnxruntime import ORTModelForFeatureExtraction + + tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") +- model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") ++ model = ORTModelForFeatureExtraction.from_pretrained("optimum/all-MiniLM-L6-v2") + inputs = tokenizer("This is an example sentence", return_tensors="pt") + outputs = model(**inputs) +``` + +You can also load your ONNX model directly using the [`sentence_transformers.SentenceTransformer`](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#onnx) class, just make sure to have `sentence-transformers>=3.2` installed. If the model wasn't already converted to ONNX, it will be converted automatically on-the-fly. + +```diff + from sentence_transformers import SentenceTransformer + + model_id = "sentence-transformers/all-MiniLM-L6-v2" +- model = SentenceTransformer(model_id) ++ model = SentenceTransformer(model_id, backend="onnx") + + sentences = ["This is an example sentence", "Each sentence is converted"] + embeddings = model.encode(sentences) +``` + + +### Timm models + +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `create_model` with the corresponding `ORTModelForImageClassification` class. + + +```diff + import requests + from PIL import Image +- from timm import create_model + from timm.data import resolve_data_config, create_transform ++ from optimum.onnxruntime import ORTModelForImageClassification + +- model = create_model("timm/mobilenetv3_large_100.ra_in1k", pretrained=True) ++ model = ORTModelForImageClassification.from_pretrained("optimum/mobilenetv3_large_100.ra_in1k") + transform = create_transform(**resolve_data_config(model.config.pretrained_cfg, model=model)) + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png" + image = Image.open(requests.get(url, stream=True).raw) + inputs = transform(image).unsqueeze(0) + outputs = model(inputs) +``` + + + ## Converting your model to ONNX on-the-fly In case your model wasn't already [converted to ONNX](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), [`~optimum.onnxruntime.ORTModel`] includes a method to convert your model to ONNX on-the-fly. 
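The next patch extends the `ORTModelForXxx` forward passes to fabricate a `token_type_ids` input whenever the exported ONNX graph declares that input but the caller omitted it. The pattern can be sketched in isolation; this is a minimal illustration only (the helper name `default_token_type_ids` is made up for the example and is not part of the Optimum API):

```python
import numpy as np
import torch


def default_token_type_ids(input_names, input_ids, token_type_ids=None):
    # Only fabricate token_type_ids when the ONNX session actually declares
    # that input and the caller omitted it; an all-zeros tensor encodes
    # "single segment", which is what tokenizers produce for one-sequence inputs.
    if token_type_ids is None and "token_type_ids" in input_names:
        if isinstance(input_ids, torch.Tensor):
            token_type_ids = torch.zeros_like(input_ids)
        else:
            token_type_ids = np.zeros_like(input_ids)
    return token_type_ids


ids = np.array([[101, 7592, 2088, 102]])  # toy BERT-style input ids
print(default_token_type_ids({"input_ids", "token_type_ids"}, ids))
# -> [[0 0 0 0]]
```

The test added alongside the change relies on exactly this equivalence: running a BERT feature extractor with an explicit all-zeros `token_type_ids` tensor and with the argument omitted must produce the same hidden states.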
From 4a39ae0b1de05601c8a33f4a13c244bdd016db24 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:20:31 +0100 Subject: [PATCH 31/50] Create token type ids when not provided (#2081) * create token type ids when needed * add test --- optimum/onnxruntime/modeling_ort.py | 19 ++++++++++++++++++- tests/onnxruntime/test_modeling.py | 12 ++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index ce1d68536ac..8e5a814b689 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -931,7 +931,6 @@ def _prepare_onnx_inputs( self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray] ) -> Dict[str, np.ndarray]: onnx_inputs = {} - # converts pytorch inputs into numpy inputs for onnx for input_name in self.input_names.keys(): onnx_inputs[input_name] = inputs.pop(input_name) @@ -1086,6 +1085,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1241,6 +1243,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1330,6 +1335,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1437,6 +1445,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1527,6 +1538,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1610,6 +1624,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, diff --git 
a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index 33243da278a..da450b8e31c 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -2192,6 +2192,18 @@ def test_compare_to_io_binding(self, model_arch):
 
         gc.collect()
 
+    def test_default_token_type_ids(self):
+        model_id = MODEL_NAMES["bert"]
+        model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokens = tokenizer("this is a simple input", return_tensors="np")
+        self.assertTrue("token_type_ids" in model.input_names)
+        token_type_ids = tokens.pop("token_type_ids")
+        outs = model(token_type_ids=token_type_ids, **tokens)
+        outs_without_token_type_ids = model(**tokens)
+        self.assertTrue(np.allclose(outs.last_hidden_state, outs_without_token_type_ids.last_hidden_state))
+        gc.collect()
+
 
 class ORTModelForMultipleChoiceIntegrationTest(ORTModelTestMixin):
     # Multiple Choice tests are conducted on different models due to mismatch size in model's classifier

From 6802a0c4e9868041aa825f629c5e983df96e3cab Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Tue, 29 Oct 2024 16:56:28 +0100
Subject: [PATCH 32/50] Add transformers 4.46 compatibility (#2078)

* transformers 4.46
* setup
* update setup
* fix t5
* update python (3.8 eol)
* fix onnx test
* fixed deberta, onnxruntime tests in series passing
* fix bt
* fixed t5_forward for real, because it's also used by blip-2 as well
* fix Phi3
* fix opt
* vision encoder decoder
* fix setup
* style
* fix encoder decoder
* fixed transformers branch
* branch
* allow 4.47
* remove patch
* add opt
* add test
* fix OPT ONNX export and inference
* add test
* update setup
* style
* merge tests
* update test num beams
* add test transformers version
* add architectures depending on transformers
* add warning
* revert
* update test generation length
* style

---------

Co-authored-by: IlyasMoutawwakil
---
 .github/workflows/check_code_quality.yml        |   2 +-
 .github/workflows/test_benckmark.yml            |  30 +-
 .github/workflows/test_cli.yml                  |   4 +-
 .github/workflows/test_export_onnx.yml          |  44 +--
 .github/workflows/test_export_onnx_cli.yml      |  30 +-
 .github/workflows/test_export_onnx_cli_timm.yml |  26 +-
 .github/workflows/test_export_onnx_timm.yml     |  27 +-
 .github/workflows/test_exporters_common.yml     |   2 +-
 .github/workflows/test_exporters_slow.yml       |   2 +-
 .github/workflows/test_fx.yml                   |   2 +-
 .github/workflows/test_offline.yml              |   2 +-
 .github/workflows/test_onnx.yml                 |   2 +-
 .github/workflows/test_onnxruntime.yml          |  13 +-
 .github/workflows/test_onnxruntime_slow.yml     |   2 +-
 .github/workflows/test_optimum_common.yml       |  39 +--
 .github/workflows/test_utils.yml                |   2 +-
 optimum/bettertransformer/models/attention.py   | 326 ++++++++++++------
 .../models/decoder_models.py                    |   4 +-
 optimum/bettertransformer/transformation.py     |  36 +-
 optimum/exporters/onnx/model_configs.py         |  49 ++-
 optimum/exporters/onnx/model_patcher.py         |   3 +-
 optimum/exporters/onnx/utils.py                 |   6 +-
 optimum/onnxruntime/modeling_decoder.py         |   4 +-
 optimum/utils/__init__.py                       |   1 +
 optimum/utils/import_utils.py                   |  16 +
 setup.py                                        |  24 +-
 tests/bettertransformer/test_audio.py           |  20 +-
 tests/bettertransformer/test_common.py          |  12 +-
 tests/bettertransformer/test_decoder.py         |   8 +-
 tests/bettertransformer/test_encoder.py         |   4 +-
 .../bettertransformer/test_encoder_decoder.py   |   2 +-
 tests/bettertransformer/test_gpu.py             |   4 +-
 tests/bettertransformer/testing_utils.py        |  18 +-
 tests/onnx/test_onnx_export_custom_module.py    |  17 +-
tests/onnxruntime/test_modeling.py | 61 ++-- tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 36 files changed, 541 insertions(+), 304 deletions(-) diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml index c429b706bff..861684cfa4d 100644 --- a/.github/workflows/check_code_quality.yml +++ b/.github/workflows/check_code_quality.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml index 7f7f2ace329..e859e845d64 100644 --- a/.github/workflows/test_benckmark.yml +++ b/.github/workflows/test_benckmark.yml @@ -4,9 +4,9 @@ name: Benchmark suite / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,20 +17,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install wheel - pip install .[tests,onnxruntime,benchmark] - - name: Test with unittest - run: | - python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install wheel + pip install .[tests,onnxruntime,benchmark] + - name: Test with unittest + run: | + python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml index ecb19d23aa3..2efab40aab6 100644 --- a/.github/workflows/test_cli.yml +++ b/.github/workflows/test_cli.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} @@ -34,7 +34,7 @@ jobs: run: | pip install --upgrade pip pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[tests,exporters,exporters-tf] + pip install .[tests,exporters-tf] - name: Test with pytest run: | diff --git a/.github/workflows/test_export_onnx.yml b/.github/workflows/test_export_onnx.yml index 56ef674cb41..0cd19a1724c 100644 --- a/.github/workflows/test_export_onnx.yml +++ b/.github/workflows/test_export_onnx.yml @@ -2,9 +2,9 @@ name: Exporters ONNX / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -15,27 +15,27 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - pytest 
diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml
index c429b706bff..861684cfa4d 100644
--- a/.github/workflows/check_code_quality.yml
+++ b/.github/workflows/check_code_quality.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml
index 7f7f2ace329..e859e845d64 100644
--- a/.github/workflows/test_benckmark.yml
+++ b/.github/workflows/test_benckmark.yml
@@ -4,9 +4,9 @@ name: Benchmark suite / Python - Test
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -17,20 +17,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        pip install wheel
-        pip install .[tests,onnxruntime,benchmark]
-    - name: Test with unittest
-      run: |
-        python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py'
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          pip install wheel
+          pip install .[tests,onnxruntime,benchmark]
+      - name: Test with unittest
+        run: |
+          python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py'
diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml
index ecb19d23aa3..2efab40aab6 100644
--- a/.github/workflows/test_cli.yml
+++ b/.github/workflows/test_cli.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04, macos-13]
 
     runs-on: ${{ matrix.os }}
@@ -34,7 +34,7 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[tests,exporters,exporters-tf]
+          pip install .[tests,exporters-tf]
 
       - name: Test with pytest
         run: |
diff --git a/.github/workflows/test_export_onnx.yml b/.github/workflows/test_export_onnx.yml
index 56ef674cb41..0cd19a1724c 100644
--- a/.github/workflows/test_export_onnx.yml
+++ b/.github/workflows/test_export_onnx.yml
@@ -2,9 +2,9 @@ name: Exporters ONNX / Python - Test
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -15,27 +15,27 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies for pytorch export
-      run: |
-        pip install .[tests,exporters]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        pytest exporters/onnx/test_onnx_*.py -s -n auto -m "not tensorflow_test and not timm_test" --durations=0
-    - name: Install dependencies for tensorflow export
-      run: |
-        pip install .[tests,exporters-tf]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        pytest exporters/onnx/test_onnx_*.py -n auto -m "tensorflow_test" -s --durations=0
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies for pytorch export
+        run: |
+          pip install .[tests,exporters]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          pytest exporters/onnx/test_onnx_*.py -s -n auto -m "not tensorflow_test and not timm_test" --durations=0
+      - name: Install dependencies for tensorflow export
+        run: |
+          pip install .[tests,exporters-tf]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          pytest exporters/onnx/test_onnx_*.py -n auto -m "tensorflow_test" -s --durations=0
diff --git a/.github/workflows/test_export_onnx_cli.yml b/.github/workflows/test_export_onnx_cli.yml
index 8fa4ebb045f..618a140c147 100644
--- a/.github/workflows/test_export_onnx_cli.yml
+++ b/.github/workflows/test_export_onnx_cli.yml
@@ -2,9 +2,9 @@ name: Exporters ONNX CLI / Python - Test
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -15,20 +15,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies for pytorch export
-      run: |
-        pip install .[tests,exporters]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies for pytorch export
+        run: |
+          pip install .[tests,exporters]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0
diff --git a/.github/workflows/test_export_onnx_cli_timm.yml b/.github/workflows/test_export_onnx_cli_timm.yml
index 76a535fcebd..b92d5551ba1 100644
--- a/.github/workflows/test_export_onnx_cli_timm.yml
+++ b/.github/workflows/test_export_onnx_cli_timm.yml
@@ -14,20 +14,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies for pytorch export
-      run: |
-        pip install .[tests,exporters]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        RUN_SLOW=1 pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -k "timm" -s --durations=0
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies for pytorch export
+        run: |
+          pip install .[tests,exporters]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          RUN_SLOW=1 pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -k "timm" -s --durations=0
diff --git a/.github/workflows/test_export_onnx_timm.yml b/.github/workflows/test_export_onnx_timm.yml
index 339e3e93dec..c16d20fbc18 100644
--- a/.github/workflows/test_export_onnx_timm.yml
+++ b/.github/workflows/test_export_onnx_timm.yml
@@ -14,21 +14,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies for pytorch export
-      run: |
-        pip install .[tests,exporters]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        RUN_SLOW=1 pytest exporters/onnx/ -s -n auto -k "timm" --durations=0
-
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies for pytorch export
+        run: |
+          pip install .[tests,exporters]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          RUN_SLOW=1 pytest exporters/onnx/ -s -n auto -k "timm" --durations=0
diff --git a/.github/workflows/test_exporters_common.yml b/.github/workflows/test_exporters_common.yml
index 8e8c3360c1f..11f6038afe4 100644
--- a/.github/workflows/test_exporters_common.yml
+++ b/.github/workflows/test_exporters_common.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_exporters_slow.yml b/.github/workflows/test_exporters_slow.yml
index b22fdd7fd2a..453389d63fa 100644
--- a/.github/workflows/test_exporters_slow.yml
+++ b/.github/workflows/test_exporters_slow.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml
index f0366cf0d1e..a4e6dd3cd29 100644
--- a/.github/workflows/test_fx.yml
+++ b/.github/workflows/test_fx.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04, macos-13]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_offline.yml b/.github/workflows/test_offline.yml
index 90b0108e512..20911fe6db8 100644
--- a/.github/workflows/test_offline.yml
+++ b/.github/workflows/test_offline.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml
index 22a11720798..dd1f3bee63d 100644
--- a/.github/workflows/test_onnx.yml
+++ b/.github/workflows/test_onnx.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04, macos-14]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
index a72bedb1ab7..0ab95752d01 100644
--- a/.github/workflows/test_onnxruntime.yml
+++ b/.github/workflows/test_onnxruntime.yml
@@ -17,8 +17,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        transformers-version: ["latest"]
         os: [ubuntu-20.04, windows-2019, macos-13]
+        include:
+          - transformers-version: "4.45.*"
+            os: ubuntu-20.04
 
     runs-on: ${{ matrix.os }}
 
     steps:
@@ -33,10 +36,10 @@
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: Setup Python ${{ matrix.python-version }}
+      - name: Setup Python
        uses: actions/setup-python@v5
        with:
-          python-version: ${{ matrix.python-version }}
+          python-version: 3.9
 
       - name: Install dependencies
         run: |
@@ -44,6 +47,10 @@
           pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
           pip install .[tests,onnxruntime]
 
+      - name: Install transformers ${{ matrix.transformers-version }}
+        if: ${{ matrix.transformers-version != 'latest' }}
+        run: pip install transformers==${{ matrix.transformers-version }}
+
       - name: Test with pytest (in series)
         working-directory: tests
         run: |
diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml
index 20371f79150..c5679e5b307 100644
--- a/.github/workflows/test_onnxruntime_slow.yml
+++ b/.github/workflows/test_onnxruntime_slow.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_optimum_common.yml b/.github/workflows/test_optimum_common.yml
index ded149c9b69..5ad42807a5f 100644
--- a/.github/workflows/test_optimum_common.yml
+++ b/.github/workflows/test_optimum_common.yml
@@ -4,9 +4,9 @@ name: Optimum common / Python - Test
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -17,25 +17,24 @@ jobs:
     strategy:
       fail-fast: false
      matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04, windows-2019, macos-13]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install .[tests]
-        ls -l optimum/
-    - name: Test with unittest
-      shell: bash
-      run: |
-        # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel.
-        export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }}
-        pytest tests/test_*.py
-
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[tests]
+          ls -l optimum/
+      - name: Test with unittest
+        shell: bash
+        run: |
+          # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel.
+          export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }}
+          pytest tests/test_*.py
diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml
index 1ef33ced086..b5f2e27fc6a 100644
--- a/.github/workflows/test_utils.yml
+++ b/.github/workflows/test_utils.yml
@@ -16,7 +16,7 @@ jobs:
     fail-fast: false
     matrix:
       os: [ubuntu-20.04, macos-13]
-      python-version: [3.8, 3.9]
+      python-version: [3.9]
 
   runs-on: ${{ matrix.os }}
   steps:
diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py
index 22b8faf1c21..c8c91a04e4e 100644
--- a/optimum/bettertransformer/models/attention.py
+++ b/optimum/bettertransformer/models/attention.py
@@ -387,137 +387,243 @@ def opt_forward(
 
 
 # Adapted from transformers.models.t5.modeling_t5.T5Attention.forward
-def t5_forward(
-    self,
-    hidden_states,
-    mask=None,
-    key_value_states=None,
-    position_bias=None,
-    past_key_value=None,
-    layer_head_mask=None,
-    query_length=None,
-    use_cache=False,
-    output_attentions=False,
-    **kwargs,
-):
-    raise_on_head_mask(layer_head_mask)
+if check_if_transformers_greater("4.45.99"):
 
-    if output_attentions is True:
-        raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
-    if len(self.pruned_heads) > 0:
-        raise ValueError(f"Setting `pruned_heads` is unsupported with BetterTransformer, found {self.pruned_heads}.")
-
-    batch_size, seq_length = hidden_states.shape[:2]
-
-    real_seq_length = seq_length
-
-    if past_key_value is not None:
-        assert (
-            len(past_key_value) == 2
-        ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
-        real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
-
-    key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
-
-    def shape(states):
-        """projection"""
-        return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
-
-    def unshape(states):
-        """reshape"""
-        return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
-
-    def project(hidden_states, proj_layer, key_value_states, past_key_value):
-        """projects hidden states correctly to key/query states"""
-        if key_value_states is None:
-            # self-attn
-            # (batch_size, n_heads, seq_length, dim_per_head)
-            hidden_states = shape(proj_layer(hidden_states))
-        elif past_key_value is None:
-            # cross-attn
-            # (batch_size, n_heads, seq_length, dim_per_head)
-            hidden_states = shape(proj_layer(key_value_states))
+    def t5_forward(
+        self,
+        hidden_states,
+        mask=None,
+        key_value_states=None,
+        position_bias=None,
+        past_key_value=None,
+        layer_head_mask=None,
+        query_length=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        """
+        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
+        """
+        # Input is (batch_size, seq_length, dim)
+        # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder)
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
+        is_cross_attention = key_value_states is not None
+
+        query_states = self.q(hidden_states)
+        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+        if past_key_value is not None:
+            is_updated = past_key_value.is_updated.get(self.layer_idx)
+            if is_cross_attention:
+                # after the first generated id, we can subsequently re-use all key/value_states from cache
+                curr_past_key_value = past_key_value.cross_attention_cache
+            else:
+                curr_past_key_value = past_key_value.self_attention_cache
+
+        current_states = key_value_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_value is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.key_cache[self.layer_idx]
+            value_states = curr_past_key_value.value_cache[self.layer_idx]
+        else:
+            key_states = self.k(current_states)
+            value_states = self.v(current_states)
+            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+            if past_key_value is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention:
+                    past_key_value.is_updated[self.layer_idx] = True
+
+        if position_bias is None:
+            key_length = key_states.shape[-2]
+            # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past)
+            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
+            if not self.has_relative_attention_bias:
+                position_bias = torch.zeros(
+                    (1, self.n_heads, seq_length, key_length), device=query_states.device, dtype=query_states.dtype
+                )
+                if self.gradient_checkpointing and self.training:
+                    position_bias.requires_grad = True
+            else:
+                position_bias = self.compute_bias(
+                    real_seq_length, key_length, device=query_states.device, cache_position=cache_position
+                )
+                position_bias = position_bias[:, :, -seq_length:, :]
+
+            if mask is not None:
+                causal_mask = mask[:, :, :, : key_states.shape[-2]]
+                position_bias = position_bias + causal_mask
+
+        if self.pruned_heads:
+            mask = torch.ones(position_bias.shape[1])
+            mask[list(self.pruned_heads)] = 0
+            position_bias_masked = position_bias[:, mask.bool()]
+        else:
+            position_bias_masked = position_bias
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=position_bias_masked,
+            dropout_p=self.dropout if self.training else 0.0,
+            is_causal=False,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
+        attn_output = self.o(attn_output)
+
+        outputs = (attn_output, past_key_value, position_bias)
+
+        return outputs
+
+else:
+
+    def t5_forward(
+        self,
+        hidden_states,
+        mask=None,
+        key_value_states=None,
+        position_bias=None,
+        past_key_value=None,
+        layer_head_mask=None,
+        query_length=None,
+        use_cache=False,
+        output_attentions=False,
+        **kwargs,
+    ):
+        raise_on_head_mask(layer_head_mask)
+
+        if output_attentions is True:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+        if len(self.pruned_heads) > 0:
+            raise ValueError(
+                f"Setting `pruned_heads` is unsupported with BetterTransformer, found {self.pruned_heads}."
+            )
+
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        real_seq_length = seq_length
+
+        if past_key_value is not None:
+            assert (
+                len(past_key_value) == 2
+            ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
+            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
+
+        key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
+
+        def shape(states):
+            """projection"""
+            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+        def unshape(states):
+            """reshape"""
+            return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
+
+        def project(hidden_states, proj_layer, key_value_states, past_key_value):
+            """projects hidden states correctly to key/query states"""
             if key_value_states is None:
                 # self-attn
-                # (batch_size, n_heads, key_length, dim_per_head)
-                hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
-            elif past_key_value.shape[2] != key_value_states.shape[1]:
-                # checking that the `sequence_length` of the `past_key_value` is the same as
-                # the provided `key_value_states` to support prefix tuning
+                # (batch_size, n_heads, seq_length, dim_per_head)
+                hidden_states = shape(proj_layer(hidden_states))
+            elif past_key_value is None:
                 # cross-attn
                 # (batch_size, n_heads, seq_length, dim_per_head)
                 hidden_states = shape(proj_layer(key_value_states))
-            else:
-                # cross-attn
-                hidden_states = past_key_value
-        return hidden_states
-
-    # get query states
-    query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
-
-    # get key/value states
-    key_states = project(
-        hidden_states,
-        self.k,
-        key_value_states,
-        past_key_value[0] if past_key_value is not None else None,
-    )
-    value_states = project(
-        hidden_states,
-        self.v,
-        key_value_states,
-        past_key_value[1] if past_key_value is not None else None,
-    )
+            if past_key_value is not None:
+                if key_value_states is None:
+                    # self-attn
+                    # (batch_size, n_heads, key_length, dim_per_head)
+                    hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
+                elif past_key_value.shape[2] != key_value_states.shape[1]:
+                    # checking that the `sequence_length` of the `past_key_value` is the same as
+                    # the provided `key_value_states` to support prefix tuning
+                    # cross-attn
+                    # (batch_size, n_heads, seq_length, dim_per_head)
+                    hidden_states = shape(proj_layer(key_value_states))
+                else:
+                    # cross-attn
+                    hidden_states = past_key_value
+            return hidden_states
+
+        # get query states
+        query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
+
+        # get key/value states
+        key_states = project(
+            hidden_states,
+            self.k,
+            key_value_states,
+            past_key_value[0] if past_key_value is not None else None,
+        )
+        value_states = project(
+            hidden_states,
+            self.v,
+            key_value_states,
+            past_key_value[1] if past_key_value is not None else None,
+        )
 
-    dropout_p = self.dropout if self.training else 0.0
-    query_states = self.scale * query_states
-    if position_bias is None and not self.has_relative_attention_bias:
-        if mask is None:
-            attn_output = torch.nn.functional.scaled_dot_product_attention(
-                query_states, key_states, value_states, attn_mask=None, dropout_p=dropout_p, is_causal=False
-            )
-        elif mask is not None:
+        dropout_p = self.dropout if self.training else 0.0
+        query_states = self.scale * query_states
+        if position_bias is None and not self.has_relative_attention_bias:
             attn_output = torch.nn.functional.scaled_dot_product_attention(
                 query_states, key_states, value_states, attn_mask=mask, dropout_p=dropout_p, is_causal=False
             )
 
-    if position_bias is None:
-        if not self.has_relative_attention_bias:
-            position_bias = torch.zeros(
-                (1, self.n_heads, real_seq_length, key_length),
-                device=value_states.device,
-                dtype=value_states.dtype,
-            )
-            if self.gradient_checkpointing and self.training:
-                position_bias.requires_grad = True
+        if position_bias is None:
+            if not self.has_relative_attention_bias:
+                position_bias = torch.zeros(
+                    (1, self.n_heads, real_seq_length, key_length),
+                    device=value_states.device,
+                    dtype=value_states.dtype,
+                )
+                if self.gradient_checkpointing and self.training:
+                    position_bias.requires_grad = True
+            else:
+                position_bias = self.compute_bias(real_seq_length, key_length, device=value_states.device)
+
+            # if key and values are already calculated
+            # we want only the last query position bias
+            if past_key_value is not None:
+                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
+
+            if mask is not None:
+                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
+
+        if self.has_relative_attention_bias:
+            attn_output = torch.nn.functional.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=position_bias,
+                dropout_p=dropout_p,
+                is_causal=False,
+            )
         else:
-            position_bias = self.compute_bias(real_seq_length, key_length, device=value_states.device)
-
-        # if key and values are already calculated
-        # we want only the last query position bias
-        if past_key_value is not None:
-            position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
-
-        if mask is not None:
-            position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
-
-    if self.has_relative_attention_bias:
             attn_output = torch.nn.functional.scaled_dot_product_attention(
                 query_states, key_states, value_states, attn_mask=position_bias, dropout_p=dropout_p, is_causal=False
             )
-    else:
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states, key_states, value_states, attn_mask=position_bias, dropout_p=dropout_p, is_causal=False
-        )
 
-    attn_output = unshape(attn_output)  # (batch_size, seq_length, dim)
-    attn_output = self.o(attn_output)
+        attn_output = unshape(attn_output)  # (batch_size, seq_length, dim)
+        attn_output = self.o(attn_output)
 
-    present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
-    outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)
+        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
+        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)
 
-    return outputs
+        return outputs
 
 
 # Adapted from transformers.models.bart.modeling_bart.BartAttention.forward
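The transformers >= 4.46 branch above follows the reworked T5 attention, which carries past key/values in an `EncoderDecoderCache` holding separate self- and cross-attention caches plus a per-layer `is_updated` flag. A minimal sketch of that cache protocol, assuming the `EncoderDecoderCache`/`DynamicCache` classes shipped with recent transformers releases:

    import torch
    from transformers.cache_utils import DynamicCache, EncoderDecoderCache

    cache = EncoderDecoderCache(DynamicCache(), DynamicCache())  # (self-attention, cross-attention)

    # Per layer and per step, t5_forward appends the new key/value states and
    # receives back the concatenated past + current tensors:
    key = torch.zeros(1, 8, 1, 64)  # (batch, heads, new tokens, head_dim)
    value = torch.zeros(1, 8, 1, 64)
    key_states, value_states = cache.self_attention_cache.update(key, value, layer_idx=0)

    # Cross-attention states are computed once per encoder input, then flagged
    # as reusable for all subsequent decoding steps:
    cache.is_updated[0] = True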
diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py
index 52d28d076d3..e8045e695c1 100644
--- a/optimum/bettertransformer/models/decoder_models.py
+++ b/optimum/bettertransformer/models/decoder_models.py
@@ -327,9 +327,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
             setattr(self, "relative_attention_bias", layer.relative_attention_bias)
             self.original_layers_mapping["relative_attention_bias"] = "relative_attention_bias"
 
-        self.module_mapping = None
-
+        self.layer_idx = getattr(layer, "layer_idx", None)
         self.is_decoder = layer.is_decoder
+        self.module_mapping = None
 
     def forward(self, *args, **kwargs):
         return t5_forward(self, *args, **kwargs)
diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py
index a101757b6fa..b138862752e 100644
--- a/optimum/bettertransformer/transformation.py
+++ b/optimum/bettertransformer/transformation.py
@@ -20,7 +20,13 @@
 import torch
 from packaging.version import parse
 
-from ..utils import check_if_pytorch_greater, is_accelerate_available, recurse_getattr, recurse_setattr
+from ..utils import (
+    check_if_pytorch_greater,
+    check_if_torch_greater,
+    is_accelerate_available,
+    recurse_getattr,
+    recurse_setattr,
+)
 from .models import BetterTransformerManager
 
 
@@ -213,15 +219,18 @@ def transform(
         hf_config = model.config
         if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]:
             raise ValueError(
-                f"Transformers now supports natively BetterTransformer optimizations (torch.nn.functional.scaled_dot_product_attention) for the model type {hf_config.model_type}. As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. Please upgrade to transformers>=4.36 and torch>=2.1.1 to use it. Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention."
+                f"Transformers now supports natively BetterTransformer optimizations (torch.nn.functional.scaled_dot_product_attention) for the model type {hf_config.model_type}. "
+                "As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. "
+                "Please upgrade to transformers>=4.36 and torch>=2.1.1 to use it. "
+                "Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention."
             )
 
-        # Check if we have to load the model using `accelerate`
-        if hasattr(model, "hf_device_map"):
-            load_accelerate = True
-            hf_device_map = model.hf_device_map
-        else:
-            load_accelerate = False
+        if hasattr(hf_config, "_attn_implementation") and hf_config._attn_implementation == "sdpa":
+            raise ValueError(
+                "This model already uses BetterTransformer optimizations from Transformers (torch.nn.functional.scaled_dot_product_attention). "
+                "As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. "
+                "Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention."
+            )
 
         if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True:
             raise Exception(
@@ -241,11 +250,20 @@ def transform(
                 f" Currently supported models are: {BetterTransformerManager.MODEL_MAPPING.keys()}."
             )
 
-        if parse(torch.__version__) <= parse("1.14"):
+        if not check_if_torch_greater("2.0"):
             raise ValueError(
                 f"BetterTransformer requires torch>=2.0 but {torch.__version__} is installed. Please upgrade PyTorch."
             )
 
+        hf_config = model.config
+
+        # Check if we have to load the model using `accelerate`
+        if hasattr(model, "hf_device_map"):
+            load_accelerate = True
+            hf_device_map = model.hf_device_map
+        else:
+            load_accelerate = False
+
         if load_accelerate:
             # Remove the hooks from the original model to avoid weights being on `meta` device.
             remove_hook_from_module(model, recurse=True)
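Since transformers 4.36, many architectures load with `attn_implementation="sdpa"` by default, which already gives the fused-attention path that BetterTransformer used to provide; the new check above rejects such models instead of silently double-patching them. The only configuration the transform still accepts, as a sketch (the model id is a placeholder):

    from transformers import AutoModel
    from optimum.bettertransformer import BetterTransformer

    model = AutoModel.from_pretrained("bert-base-uncased", attn_implementation="eager")
    model = BetterTransformer.transform(model)  # raises a ValueError when the model already uses SDPA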
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index e77f649f69b..9e57128c272 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -155,7 +155,7 @@ class SplinterOnnxConfig(BertOnnxConfig):
 
 
 class DistilBertOnnxConfig(BertOnnxConfig):
-    DEFAULT_ONNX_OPSET = 11
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for transformers>=4.46.0
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
@@ -266,10 +266,18 @@ class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
 
-class OPTOnnxConfig(TextDecoderOnnxConfig):
-    # OPT does not require position_ids input.
-    DEFAULT_ONNX_OPSET = 13
-    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+# OPT does not take position_ids as input for transformers < v4.46, needs it for transformers >= v4.46
+if check_if_transformers_greater("4.45.99"):
+
+    class OPTOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
+        DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
+        NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+
+else:
+
+    class OPTOnnxConfig(TextDecoderOnnxConfig):
+        DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
+        NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
 
 class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
@@ -304,6 +312,15 @@ class Phi3OnnxConfig(PhiOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfigWithGQA
     MIN_TRANSFORMERS_VERSION = version.parse("4.41.0")
 
+    def __init__(self, *args, **kwargs):
+        # TODO : replace check_if_transformers_greater with is_transformers_available
+        if check_if_transformers_greater("4.46.0") and not check_if_transformers_greater("4.46.1"):
+            logger.error(
+                "Found transformers v4.46.0 while trying to export a Phi3 model, this specific version of transformers is not supported. "
+                "Please upgrade to v4.46.1 or higher, or downgrade your transformers version"
+            )
+        super().__init__(*args, **kwargs)
+
 
 class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35
@@ -480,7 +497,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
 
 
 class T5OnnxConfig(TextSeq2SeqOnnxConfig):
-    DEFAULT_ONNX_OPSET = 13
+    DEFAULT_ONNX_OPSET = 14  # T5 uses aten::triu that requires opset>=14
     DUMMY_INPUT_GENERATOR_CLASSES = TextSeq2SeqOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES[:-1] + (
         T5DummySeq2SeqPastKeyValuesGenerator,
     )
@@ -2027,6 +2044,7 @@ class TrOCROnnxConfig(TextSeq2SeqOnnxConfig):
 class VisionEncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig
     ATOL_FOR_VALIDATION = 1e-3
+    DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
 
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator)
 
@@ -2156,8 +2174,21 @@ class Pix2StructOnnxConfig(OnnxSeq2SeqConfigWithPast):
         DummySeq2SeqPastKeyValuesGenerator,
         DummyPix2StructInputGenerator,
     )
-    # Min operator needs to support int64, which is the case for opset>=12
-    DEFAULT_ONNX_OPSET = 12
+
+    DEFAULT_ONNX_OPSET = 14  # uses 'aten::triu' now, which requires opset>=14
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # TODO : replace check_if_transformers_greater with is_transformers_available
+        if (
+            check_if_transformers_greater("4.46.0")
+            and not check_if_transformers_greater("4.46.1")
+            and self._behavior is ConfigBehavior.DECODER
+        ):
+            logger.error(
+                "Found transformers v4.46.0 while trying to export a Pix2Struct model, this specific version of transformers is not supported. "
+                "Please upgrade to v4.46.1 or higher, or downgrade your transformers version"
+            )
 
     @property
     def inputs(self):
@@ -2310,3 +2341,5 @@ def overwrite_shape_and_generate_input(
 
 class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig
+
+    DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
index 34ed5fcae46..fdfb0e280f5 100644
--- a/optimum/exporters/onnx/model_patcher.py
+++ b/optimum/exporters/onnx/model_patcher.py
@@ -34,11 +34,10 @@
 
 if _transformers_version > version.parse("4.34.99"):
-    from transformers.modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask
+    from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 
     if _transformers_version >= version.parse("4.36"):
         from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa
 else:
-    _prepare_4d_causal_attention_mask = None
     _prepare_4d_causal_attention_mask_for_sdpa = None
     AttentionMaskConverter = None
diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py
index 675566ba23e..56249bbf5c3 100644
--- a/optimum/exporters/onnx/utils.py
+++ b/optimum/exporters/onnx/utils.py
@@ -27,7 +27,7 @@
     is_diffusers_available,
     logging,
 )
-from ...utils.import_utils import _diffusers_version
+from ...utils.import_utils import _diffusers_version, check_if_transformers_greater
 from ..utils import (
     _get_submodels_and_export_configs,
 )
@@ -89,6 +89,10 @@
 }
 
 
+if check_if_transformers_greater("4.45.99"):
+    MODEL_TYPES_REQUIRING_POSITION_IDS.add("opt")
+
+
 def check_onnxruntime_requirements(minimum_version: version.Version):
     """
     Checks that ONNX Runtime is installed and if version is recent enough.
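Whether an exported OPT decoder consumes `position_ids` is baked into the ONNX graph at export time, so the runtime side (next diff) decides by inspecting the graph inputs rather than the transformers version installed at load time. The idea, as a sketch built around a hypothetical helper:

    import onnx

    def exported_with_position_ids(model_path: str) -> bool:
        # OPT decoders exported with transformers >= 4.46 carry a position_ids input.
        model = onnx.load(model_path, load_external_data=False)
        return any(inp.name == "position_ids" for inp in model.graph.input)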
diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py
index bda3ec98d9a..984d7f22ebf 100644
--- a/optimum/onnxruntime/modeling_decoder.py
+++ b/optimum/onnxruntime/modeling_decoder.py
@@ -582,7 +582,8 @@ def _from_pretrained(
             init_cls = ORTFalconForCausalLM
         elif config.model_type == "mpt":
             init_cls = ORTMPTForCausalLM
-        elif config.model_type == "opt":
+        # if model was exported with position_ids it means the model was exported with transformers >= v4.46
+        elif config.model_type == "opt" and "position_ids" not in input_dims:
             init_cls = ORTOPTForCausalLM
         elif config.model_type == "gpt_bigcode":
             init_cls = ORTGPTBigCodeForCausalLM
@@ -839,7 +840,6 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg
         attention_mask = kwargs.get("attention_mask", None)
         use_cache = kwargs.get("use_cache", None)
 
-
         return {
             "input_ids": input_ids,
             "past_key_values": past_key_values,
diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py
index 5d5044e63e1..db7d1f6975d 100644
--- a/optimum/utils/__init__.py
+++ b/optimum/utils/__init__.py
@@ -29,6 +29,7 @@
     TRANSFORMERS_MINIMUM_VERSION,
     check_if_diffusers_greater,
     check_if_pytorch_greater,
+    check_if_torch_greater,
     check_if_transformers_greater,
     is_accelerate_available,
     is_auto_gptq_available,
diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py
index 4a57fda79ce..35a6294ab52 100644
--- a/optimum/utils/import_utils.py
+++ b/optimum/utils/import_utils.py
@@ -193,6 +193,22 @@ def check_if_diffusers_greater(target_version: str) -> bool:
     return version.parse(_diffusers_version) >= version.parse(target_version)
 
 
+def check_if_torch_greater(target_version: str) -> bool:
+    """
+    Checks whether the current install of torch is greater than or equal to the target version.
+
+    Args:
+        target_version (str): version used as the reference for comparison.
+
+    Returns:
+        bool: whether the check is True or not.
+    """
+    if not is_torch_available():
+        return False
+
+    return torch_version >= version.parse(target_version)
+
+
 @contextmanager
 def require_numpy_strictly_lower(package_version: str, message: str):
     if not version.parse(np.__version__) < version.parse(package_version):
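The `check_if_torch_greater` helper added above mirrors the existing `check_if_pytorch_greater`/`check_if_transformers_greater` utilities, comparing the parsed torch version and returning `False` when torch is not installed. Usage sketch:

    from optimum.utils import check_if_torch_greater

    if check_if_torch_greater("2.0"):
        # safe to rely on torch.nn.functional.scaled_dot_product_attention
        pass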
diff --git a/setup.py b/setup.py
index 822d8be1b80..82892bfcc8c 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
 REQUIRED_PKGS = [
     "coloredlogs",
     "sympy",
-    "transformers[sentencepiece]>=4.29",
+    "transformers>=4.29",
     "torch>=1.11",
     "packaging",
     "numpy",
@@ -37,9 +37,9 @@
     "diffusers>=0.17.0",
     "torchaudio",
     "einops",
-    "invisible-watermark",
     "timm",
     "scikit-learn",
+    "sentencepiece",
     "rjieba",
 ]
 
@@ -54,7 +54,7 @@
         "datasets>=1.2.1",
         "evaluate",
         "protobuf>=3.20.1",
-        "transformers<4.46.0",
+        "transformers<4.47.0",
     ],
     "onnxruntime-gpu": [
         "onnx",
@@ -63,10 +63,20 @@
         "evaluate",
         "protobuf>=3.20.1",
         "accelerate",  # ORTTrainer requires it.
-        "transformers<4.46.0",
+        "transformers<4.47.0",
+    ],
+    "exporters": [
+        "onnx",
+        "onnxruntime",
+        "timm",
+        "transformers<4.47.0",
+    ],
+    "exporters-gpu": [
+        "onnx",
+        "onnxruntime-gpu",
+        "timm",
+        "transformers<4.47.0",
     ],
-    "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.46.0"],
-    "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.46.0"],
     "exporters-tf": [
         "tensorflow>=2.4,<=2.12.1",
         "tf2onnx",
@@ -76,7 +86,7 @@
         "h5py",
         "numpy<1.24.0",
         "datasets<=2.16",
-        "transformers[sentencepiece]>=4.26,<4.38",
+        "transformers>=4.26,<4.38",
     ],
     "diffusers": ["diffusers"],
     "intel": "optimum-intel>=1.18.0",
diff --git a/tests/bettertransformer/test_audio.py b/tests/bettertransformer/test_audio.py
index be01a92d447..caca91e27ca 100644
--- a/tests/bettertransformer/test_audio.py
+++ b/tests/bettertransformer/test_audio.py
@@ -35,7 +35,7 @@ class TestsWhisper(unittest.TestCase):
     def test_error_message(self):
-        model = AutoModel.from_pretrained("openai/whisper-tiny")
+        model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")
 
         with self.assertRaises(ValueError) as cm:
             model = BetterTransformer.transform(model)
@@ -82,15 +82,19 @@ def _test_fp16_inference(
         set_seed(0)
         if not use_to_operator:
-            hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0)
+            hf_random_model = automodel_class.from_pretrained(
+                model_id, torch_dtype=torch.float16, attn_implementation="eager"
+            ).to(0)
             converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=False)
-            hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0)
+            hf_random_model = automodel_class.from_pretrained(
+                model_id, torch_dtype=torch.float16, attn_implementation="eager"
+            ).to(0)
         else:
-            hf_random_model = automodel_class.from_pretrained(model_id).to(0)
+            hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0)
             converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=False)
-            hf_random_model = automodel_class.from_pretrained(model_id).to(0)
+            hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0)
 
             hf_random_model = hf_random_model.to(torch.float16)
             converted_model = converted_model.to(torch.float16)
@@ -147,7 +151,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int):
         model_id = MODELS_DICT[model_type]
         processor = AutoProcessor.from_pretrained(model_id)
-        model = AutoModel.from_pretrained(model_id)
+        model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
 
         text = ["This is me and me"]
         if batch_size > 1:
@@ -217,14 +221,14 @@ def test_logits(self, model_type: str):
         inputs = self.prepare_inputs_for_class(model_id, model_type)
 
         torch.manual_seed(0)
-        hf_random_model = AutoModel.from_pretrained(model_id).eval()
+        hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
         random_config = hf_random_model.config
 
         torch.manual_seed(0)
         converted_model = BetterTransformer.transform(hf_random_model)
 
         torch.manual_seed(0)
-        hf_random_model = AutoModel.from_pretrained(model_id).eval()
+        hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
         random_config = hf_random_model.config
 
         self.assertFalse(
diff --git a/tests/bettertransformer/test_common.py b/tests/bettertransformer/test_common.py
index 35b89d2ed2e..b8bc0a3b3d9 100644
--- a/tests/bettertransformer/test_common.py
+++ b/tests/bettertransformer/test_common.py
@@ -28,7 +28,7 @@ class BetterTransformerIntegrationTests(unittest.TestCase):
     def test_raise_error_on_double_transform_call(self):
-        model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+        model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="eager")
 
         with self.assertRaises(Exception) as cm:
             bt_model = BetterTransformer.transform(model)
@@ -59,7 +59,7 @@ def test_raise_on_save(self, model_type: str):
         )
         for model_id in model_ids:
             with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname:
-                hf_model = AutoModel.from_pretrained(model_id).eval()
+                hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
                 bt_model = BetterTransformer.transform(hf_model, keep_original_model=False)
                 bt_model.save_pretrained(tmpdirname)
 
@@ -73,7 +73,7 @@ def test_conversion(self, model_type: str):
             MODELS_DICT[model_type] if isinstance(MODELS_DICT[model_type], tuple) else (MODELS_DICT[model_type],)
         )
         for model_id in model_ids:
-            hf_random_model = AutoModel.from_pretrained(model_id)
+            hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
             converted_model = BetterTransformer.transform(hf_random_model)
 
             self.assertTrue(
@@ -99,7 +99,7 @@ def test_raise_save_pretrained_error(self, test_name: str, model_type: str, keep
         )
         for model_id in model_ids:
             # get hf and bt model
-            hf_model = AutoModel.from_pretrained(model_id)
+            hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
             # get bt model and invert it
             bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model)
 
@@ -145,9 +145,11 @@ def test_raise_activation_fun(self, model_type: str):
         )()  # random config class for the model to test
         hf_random_config.hidden_act = "silu"
 
-        hf_random_model = AutoModel.from_config(hf_random_config).eval()
+        hf_random_model = AutoModel.from_config(hf_random_config, attn_implementation="eager").eval()
+
         with self.assertRaises(ValueError) as cm:
             _ = BetterTransformer.transform(hf_random_model, keep_original_model=True)
+
         self.assertTrue("Activation function" in str(cm.exception))
 
     def test_dict_class_consistency(self):
diff --git a/tests/bettertransformer/test_decoder.py b/tests/bettertransformer/test_decoder.py
index bab8f376fcc..e2bc6ddc2fb 100644
--- a/tests/bettertransformer/test_decoder.py
+++ b/tests/bettertransformer/test_decoder.py
@@ -131,7 +131,7 @@ def test_logits_with_cache(self, test_name: str, model_type: str, batch_size: in
         model_id = MODELS_DICT[model_type]
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager")
 
         normalized_config = NormalizedConfigManager.get_normalized_config_class(model.config.model_type)(model.config)
 
@@ -167,7 +167,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd
         model_id = MODELS_DICT[model_type]
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager")
 
         if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None:
             if tokenizer.eos_token != "":
@@ -224,7 +224,9 @@ def test_invert_model_logits(self, test_name: str, model_type: str, keep_origina
     @require_torch_gpu
     @require_accelerate
     def test_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_memory=None):
-        hf_model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto", max_memory=max_memory).eval()
+        hf_model = AutoModelForCausalLM.from_pretrained(
+            "gpt2", device_map="auto", max_memory=max_memory, attn_implementation="eager"
+        ).eval()
         bt_model = BetterTransformer.transform(
             hf_model, keep_original_model=keep_original_model, max_memory=max_memory
         )
diff --git a/tests/bettertransformer/test_encoder.py b/tests/bettertransformer/test_encoder.py
index 74aacaed58c..7dd42c43b05 100644
--- a/tests/bettertransformer/test_encoder.py
+++ b/tests/bettertransformer/test_encoder.py
@@ -181,7 +181,9 @@ def check_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_m
         If this works for roberta, it should work for all other models too.
         """
 
-        hf_model = AutoModel.from_pretrained("xlm-roberta-base", device_map="auto", max_memory=max_memory).eval()
+        hf_model = AutoModel.from_pretrained(
+            "xlm-roberta-base", device_map="auto", max_memory=max_memory, attn_implementation="eager"
+        ).eval()
         bt_model = BetterTransformer.transform(
             hf_model, keep_original_model=keep_original_model, max_memory=max_memory
         )
diff --git a/tests/bettertransformer/test_encoder_decoder.py b/tests/bettertransformer/test_encoder_decoder.py
index 8d05923522a..5ce4d62b12c 100644
--- a/tests/bettertransformer/test_encoder_decoder.py
+++ b/tests/bettertransformer/test_encoder_decoder.py
@@ -153,7 +153,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd
         model_id = MODELS_DICT[model_type]
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_id, attn_implementation="eager")
 
         if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
diff --git a/tests/bettertransformer/test_gpu.py b/tests/bettertransformer/test_gpu.py
index b992b90d3c8..ada38e408fa 100644
--- a/tests/bettertransformer/test_gpu.py
+++ b/tests/bettertransformer/test_gpu.py
@@ -26,7 +26,9 @@ def timing_cuda(model, num_batches, input_ids, masks, decoder_input_ids):
 
 def benchmark(model_name: str, num_batches: int, batch_size: int, max_seqlen: int, is_half: bool):
-    hf_model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16 if is_half else None).eval()
+    hf_model = AutoModel.from_pretrained(
+        model_name, torch_dtype=torch.float16 if is_half else None, attn_implementation="eager"
+    ).eval()
     hf_model = hf_model.to("cuda:0")
 
     bt_model = BetterTransformer.transform(hf_model, keep_original_model=True)
diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py
index 098882180aa..f79cbb34512 100644
--- a/tests/bettertransformer/testing_utils.py
+++ b/tests/bettertransformer/testing_utils.py
@@ -136,10 +136,12 @@ def _test_fp16_inference(
         torch.manual_seed(0)
         if not use_to_operator:
-            hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0)
+            hf_random_model = automodel_class.from_pretrained(
+                model_id, torch_dtype=torch.float16, attn_implementation="eager"
+            ).to(0)
             converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True)
         else:
-            hf_random_model = automodel_class.from_pretrained(model_id).to(0)
+            hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0)
             converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True)
             hf_random_model = hf_random_model.to(torch.float16)
             converted_model = converted_model.to(torch.float16)
@@ -169,7 +171,7 @@
     def _test_logits_backward(self, model_id: str, model_type: str, **preprocessor_kwargs):
         inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **preprocessor_kwargs)
 
-        hf_random_model = AutoModel.from_pretrained(model_id).eval()
+        hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
         random_config = hf_random_model.config
 
         # I could not obtain reproducible results with `torch.manual_seed` nor with
@@ -309,7 +311,7 @@ def _test_train_decoder(self, model_id: str, model_type: str, **kwargs):
         """
         inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **kwargs)
 
-        hf_random_model = AutoModel.from_pretrained(model_id).eval()
+        hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
         bt_model = BetterTransformer.transform(hf_random_model, keep_original_model=True)
         bt_model.train()
 
@@ -328,7 +330,7 @@ def _test_invert_modules(self, model_id, keep_original_model=False):
         r"""
         Test that the inverse converted model and hf model have the same modules
         """
-        hf_model = AutoModel.from_pretrained(model_id)
+        hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
         hf_modules = list(hf_model.modules())
 
         bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model)
@@ -349,7 +351,7 @@ def _test_invert_modules(self, model_id, keep_original_model=False):
 
     def _test_save_load_invertible(self, model_id, keep_original_model=True):
         with tempfile.TemporaryDirectory() as tmpdirname:
-            hf_model = AutoModel.from_pretrained(model_id).eval()
+            hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
             hf_model_state_dict = copy.deepcopy(hf_model.state_dict())
 
             bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model)
@@ -362,7 +364,7 @@ def _test_save_load_invertible(self, model_id, keep_original_model=True):
             # saving a normal transformers bark model fails because of shared tensors
             bt_model.save_pretrained(tmpdirname, safe_serialization=hf_model.config.model_type != "bark")
 
-            bt_model_from_load = AutoModel.from_pretrained(tmpdirname)
+            bt_model_from_load = AutoModel.from_pretrained(tmpdirname, attn_implementation="eager")
 
             self.assertEqual(
                 set(bt_model.state_dict().keys()),
@@ -397,7 +399,7 @@ def _test_invert_model_logits(
         """
         inputs = self.prepare_inputs_for_class(model_id, model_type=model_type, **preprocessor_kwargs)
 
-        hf_model = AutoModel.from_pretrained(model_id)
+        hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
         hf_model = hf_model.eval()
 
         with torch.inference_mode():
diff --git a/tests/onnx/test_onnx_export_custom_module.py b/tests/onnx/test_onnx_export_custom_module.py
index a144d5cd840..4398c14f01d 100644
--- a/tests/onnx/test_onnx_export_custom_module.py
+++ b/tests/onnx/test_onnx_export_custom_module.py
@@ -24,6 +24,8 @@
     import torch
     from transformers.models.deberta import modeling_deberta
 
+    from optimum.utils import check_if_torch_greater
+
 
 class StableDropoutTestCase(TestCase):
     """Tests export of StableDropout module."""
@@ -50,8 +52,8 @@ def test_training(self):
             training=training,
         )
 
-        # Expected to fail with opset_version < 12
-        with self.assertRaises(Exception):
+        if check_if_torch_greater("2.5"):
+            # Expected to pass with opset_version < 12 on torch >= 2.5
             torch.onnx.export(
                 sd,
                 input,
@@ -60,3 +62,14 @@ def test_training(self):
                 do_constant_folding=do_constant_folding,
                 training=training,
             )
+        else:
+            # Expected to fail with opset_version < 12 on torch < 2.5
+            with self.assertRaises(Exception):
+                torch.onnx.export(
+                    sd,
+                    input,
+                    devnull,
+                    opset_version=11,
+                    do_constant_folding=do_constant_folding,
+                    training=training,
+                )
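The causal-LM test below is rebuilt around `GenerationConfig` objects so that one token budget can be replayed across plain beam search, beam sample and group beam search, comparing ONNX Runtime against transformers token-for-token. The pattern, as a minimal sketch:

    from transformers import GenerationConfig

    gen_kwargs = {"max_new_tokens": 5, "min_new_tokens": 5, "eos_token_id": None, "num_beams": 4}

    beam_search = GenerationConfig(do_sample=False, **gen_kwargs)
    beam_sample = GenerationConfig(do_sample=True, **gen_kwargs)
    group_beam_search = GenerationConfig(do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs)

    # outputs = model.generate(**tokens, generation_config=beam_search)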
diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index da450b8e31c..597eb581e2a 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -54,6 +54,7 @@
     AutoModelForTokenClassification,
     AutoModelForVision2Seq,
     AutoTokenizer,
+    GenerationConfig,
     MBartForConditionalGeneration,
     Pix2StructForConditionalGeneration,  # Pix2Struct does not work with AutoModel
     PretrainedConfig,
@@ -106,7 +107,7 @@
     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
     logging,
 )
-from optimum.utils.import_utils import is_diffusers_available
+from optimum.utils.import_utils import check_if_transformers_greater, is_diffusers_available
 from optimum.utils.testing_utils import (
     grid_parameters,
     remove_directory,
@@ -2326,10 +2327,12 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin):
         "llama",
         "mistral",
         "mpt",
-        "phi3",
-        "qwen2",
+        "opt",
     ]
 
+    if check_if_transformers_greater("4.40"):
+        SUPPORTED_ARCHITECTURES.extend(["gemma", "phi3", "qwen2"])
+
     FULL_GRID = {
         "model_arch": SUPPORTED_ARCHITECTURES,
         "use_cache": [False, True],
@@ -2338,7 +2341,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin):
     ORTMODEL_CLASS = ORTModelForCausalLM
     TASK = "text-generation"
 
-    GENERATION_LENGTH = 100
+    GENERATION_LENGTH = 90
     SPEEDUP_CACHE = 1.1
 
     @parameterized.expand([(False,), (True,)])
@@ -2411,7 +2414,7 @@ def test_merge_from_onnx_and_save(self, model_arch):
         self.assertNotIn(ONNX_DECODER_WITH_PAST_NAME, folder_contents)
         self.assertNotIn(ONNX_WEIGHTS_NAME, folder_contents)
 
-    @parameterized.expand(grid_parameters({**FULL_GRID, "num_beams": [1, 3]}))
+    @parameterized.expand(grid_parameters({**FULL_GRID, "num_beams": [1, 4]}))
     def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cache: bool, num_beams: int):
         use_io_binding = None
         if use_cache is False:
@@ -2474,25 +2477,39 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach
         # TODO: remove once https://github.com/huggingface/transformers/pull/26873 is released, falcon is broken in transformers
         new_tokens = 5
-        onnx_outputs = onnx_model.generate(
-            **tokens,
-            num_beams=num_beams,
-            do_sample=False,
-            min_new_tokens=new_tokens,
-            max_new_tokens=new_tokens,
-            eos_token_id=None,
-        )
+        gen_kwargs = {
+            "max_new_tokens": new_tokens,
+            "min_new_tokens": new_tokens,
+            "eos_token_id": None,
+            "num_beams": num_beams,
+        }
 
-        transformers_outputs = transformers_model.generate(
-            **tokens,
-            num_beams=num_beams,
-            do_sample=False,
-            min_new_tokens=new_tokens,
-            max_new_tokens=new_tokens,
-            eos_token_id=None,
-        )
+        beam_search_gen_config = GenerationConfig(do_sample=False, **gen_kwargs)
+
+        if use_cache and num_beams == 4:
+            beam_sample_gen_config = GenerationConfig(do_sample=True, **gen_kwargs)
+            group_beam_search_gen_config = GenerationConfig(
+                do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs
+            )
+            gen_configs = (
+                beam_search_gen_config,
+                beam_sample_gen_config,
+                group_beam_search_gen_config,
+            )
+        else:
+            gen_configs = (beam_search_gen_config,)
 
-        self.assertTrue(torch.allclose(onnx_outputs, transformers_outputs))
+        for gen_config in gen_configs:
+            set_seed(SEED)
+            with torch.no_grad():
+                transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config)
+            set_seed(SEED)
+            onnx_outputs = onnx_model.generate(**tokens, generation_config=gen_config)
+
+            self.assertTrue(
+                torch.equal(onnx_outputs, transformers_outputs),
+                f"Failed with generation config : {gen_config}, transformers outputs {transformers_outputs}, ONNX model outputs {onnx_outputs}",
+            )
 
         gc.collect()
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index 5071d0081af..e3d54237857 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -125,6 +125,7 @@
     "mpt": "hf-internal-testing/tiny-random-MptForCausalLM",
     "mt5": "lewtun/tiny-random-mt5",
     "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
+    "opt": "hf-internal-testing/tiny-random-OPTModel",
     "pegasus": "hf-internal-testing/tiny-random-PegasusModel",
     "perceiver_text": "hf-internal-testing/tiny-random-language_perceiver",
     "perceiver_vision": "hf-internal-testing/tiny-random-vision_perceiver_conv",

From 7e8d857d1ed6be32046324bf8f424690f116b4e9 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Thu, 31 Oct 2024 14:54:05 -0600
Subject: [PATCH 33/50] Add ONNX export support for granite models (#2043)

* feat(exporters/onnx): Add GraniteOnnxConfig and task support list

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* feat: Add granite's normalized config for inference

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* feat(onnx opt): Add onnx optimization support for granite

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* fix(onnx/granite): Use LlamaOnnxConfig as the base for GraniteOnnxConfig

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* fix(onnxruntime): Add "granite" to list of model types with grouped attention

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* fix: Add granite to the list of models that require position_ids

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* fix(granite): Add MIN_TORCH_VERSION for recently fixed torch bug

https://github.com/huggingface/optimum/pull/2043#issuecomment-2427975461

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* test(granite): Add tiny random granite test for onnx exporter

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* tests(onnxruntime): Add granite to onnxruntime tests

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

---------

Signed-off-by: Gabe Goodhart
---
 optimum/exporters/onnx/model_configs.py | 5 +++++
 optimum/exporters/onnx/utils.py | 1 +
 optimum/exporters/tasks.py | 7 +++++++
 optimum/onnxruntime/modeling_decoder.py | 2 +-
 optimum/onnxruntime/utils.py | 1 +
 optimum/utils/normalized_config.py | 1 +
 tests/exporters/exporters_utils.py | 1 +
 tests/onnxruntime/test_modeling.py | 1 +
 tests/onnxruntime/utils_onnxruntime_tests.py | 1 +
 9 files changed, 19 insertions(+), 1 deletion(-)
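With the configuration below registered, Granite checkpoints export and load like any other supported decoder. For example, using the tiny test checkpoint referenced in this patch (any Granite model id would work the same way):

    from optimum.onnxruntime import ORTModelForCausalLM

    model = ORTModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-GraniteForCausalLM", export=True)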
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 56249bbf5c3..19e24f88743 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -86,6 +86,7 @@ "phi", "phi3", "qwen2", + "granite", } diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index a489f34fb06..fdc8bfcb539 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -915,6 +915,13 @@ class TasksManager: "text-classification", onnx="LlamaOnnxConfig", ), + "granite": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + onnx="GraniteOnnxConfig", + ), "pegasus": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index 984d7f22ebf..8f1d062221a 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -340,7 +340,7 @@ def prepare_past_key_values( if self.model_type == "gemma": num_attention_heads = self.normalized_config.num_key_value_heads embed_size_per_head = self.normalized_config.head_dim - elif self.model_type in {"mistral", "llama", "qwen2"}: + elif self.model_type in {"mistral", "llama", "qwen2", "granite"}: num_attention_heads = self.normalized_config.num_key_value_heads else: num_attention_heads = self.normalized_config.num_attention_heads diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 128e2406f11..9e92e0bd325 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -128,6 +128,7 @@ class ORTConfigManager: "gpt-neo": "gpt2", "gpt-neox": "gpt2", "gptj": "gpt2", + "granite": "gpt2", # longt5 with O4 results in segmentation fault "longt5": "bert", "llama": "gpt2", diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 81207b76496..9ceed24c2dd 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -281,6 +281,7 @@ class NormalizedConfigManager: "xlm-roberta": NormalizedTextConfig, "yolos": NormalizedVisionConfig, "qwen2": NormalizedTextConfig, + "granite": NormalizedTextConfigWithGQA, } @classmethod diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index c8a33b0be35..ccccb5510bf 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -100,6 +100,7 @@ "gpt-neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt-neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", "ibert": "hf-internal-testing/tiny-random-IBertModel", "imagegpt": "hf-internal-testing/tiny-random-ImageGPTModel", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 597eb581e2a..a335e014478 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2324,6 +2324,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "gpt_neo", "gpt_neox", "gptj", + "granite", "llama", "mistral", "mpt", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index e3d54237857..9f200e69b3d 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ 
b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -104,6 +104,7 @@ "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJForCausalLM", + "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", From 35eebfe62bf721bbab365f569bd0c73057239732 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:01:46 +0100 Subject: [PATCH 34/50] Drop python 3.8 (#2086) * drop python 3.8 * fix * add python 3.11 --- .github/workflows/dev_test_benckmark.yml | 8 ++------ .github/workflows/dev_test_bettertransformer.yml | 6 ++---- .github/workflows/dev_test_dummy_inputs.yml | 4 +--- .github/workflows/dev_test_exporters.yml | 8 ++------ .github/workflows/dev_test_fx.yml | 4 +--- .github/workflows/dev_test_onnx.yml | 4 +--- .github/workflows/dev_test_onnxruntime.yml | 4 +--- .github/workflows/dev_test_optimum_common.yml | 5 +---- .github/workflows/test_export_onnx.yml | 2 +- .github/workflows/test_export_tflite.yml | 5 ++--- .github/workflows/test_export_tflite_cli.yml | 5 ++--- .../test_export_tflite_cli_dynamic_quantization_int8.yml | 5 ++--- .../test_export_tflite_cli_quantization_fp16.yml | 5 ++--- .../test_export_tflite_cli_quantization_full_int8.yml | 5 ++--- ...export_tflite_cli_quantization_int8_custom_dataset.yml | 5 ++--- ...xport_tflite_cli_quantization_int8_default_dataset.yml | 5 ++--- .../test_export_tflite_cli_quantization_int8x16.yml | 5 ++--- .github/workflows/test_exporters_common.yml | 5 ++--- .github/workflows/test_exporters_slow.yml | 5 ++--- .github/workflows/test_fx.yml | 2 +- .github/workflows/test_offline.yml | 5 ++--- .github/workflows/test_onnx.yml | 2 +- .github/workflows/test_onnxruntime.yml | 2 +- .github/workflows/test_onnxruntime_slow.yml | 2 +- .github/workflows/test_optimum_common.yml | 4 ++-- .github/workflows/test_utils.yml | 2 +- setup.py | 7 ++++--- 27 files changed, 45 insertions(+), 76 deletions(-) diff --git a/.github/workflows/dev_test_benckmark.yml b/.github/workflows/dev_test_benckmark.yml index 5f6fc825021..a898d288625 100644 --- a/.github/workflows/dev_test_benckmark.yml +++ b/.github/workflows/dev_test_benckmark.yml @@ -12,12 +12,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 - os: - - ubuntu-20.04 - runs-on: ${{ matrix.os }} + python-version: ['3.9', '3.11'] + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/dev_test_bettertransformer.yml b/.github/workflows/dev_test_bettertransformer.yml index e4c999ca6da..e75b5e3bf98 100644 --- a/.github/workflows/dev_test_bettertransformer.yml +++ b/.github/workflows/dev_test_bettertransformer.yml @@ -12,18 +12,16 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 os: - ubuntu-20.04 - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} + - name: Setup Python uses: actions/setup-python@v2 with: - python-version: ${{ matrix.python-version }} + python-version: '3.9' - name: Install dependencies run: | pip install .[tests] diff --git a/.github/workflows/dev_test_dummy_inputs.yml b/.github/workflows/dev_test_dummy_inputs.yml index 49baa49c418..72a4763e432 100644 --- 
a/.github/workflows/dev_test_dummy_inputs.yml +++ b/.github/workflows/dev_test_dummy_inputs.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - macos-13 diff --git a/.github/workflows/dev_test_exporters.yml b/.github/workflows/dev_test_exporters.yml index 5d967d125f5..b2dee3ed3a9 100644 --- a/.github/workflows/dev_test_exporters.yml +++ b/.github/workflows/dev_test_exporters.yml @@ -12,12 +12,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 - os: - - ubuntu-20.04 - runs-on: ${{ matrix.os }} + python-version: ['3.9', '3.11'] + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/dev_test_fx.yml b/.github/workflows/dev_test_fx.yml index 0b8633282f7..a0c54c78365 100644 --- a/.github/workflows/dev_test_fx.yml +++ b/.github/workflows/dev_test_fx.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - macos-13 diff --git a/.github/workflows/dev_test_onnx.yml b/.github/workflows/dev_test_onnx.yml index 48052cfded3..f7514e1c5e5 100644 --- a/.github/workflows/dev_test_onnx.yml +++ b/.github/workflows/dev_test_onnx.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - macos-13 diff --git a/.github/workflows/dev_test_onnxruntime.yml b/.github/workflows/dev_test_onnxruntime.yml index 857028ab2db..c9104ebbd6c 100644 --- a/.github/workflows/dev_test_onnxruntime.yml +++ b/.github/workflows/dev_test_onnxruntime.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - windows-2019 diff --git a/.github/workflows/dev_test_optimum_common.yml b/.github/workflows/dev_test_optimum_common.yml index 807ed0b1dab..117db50437b 100644 --- a/.github/workflows/dev_test_optimum_common.yml +++ b/.github/workflows/dev_test_optimum_common.yml @@ -12,10 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.7 - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - windows-2019 diff --git a/.github/workflows/test_export_onnx.yml b/.github/workflows/test_export_onnx.yml index 0cd19a1724c..d1fd4a9723f 100644 --- a/.github/workflows/test_export_onnx.yml +++ b/.github/workflows/test_export_onnx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_export_tflite.yml b/.github/workflows/test_export_tflite.yml index 362390b166d..225a28c1cba 100644 --- a/.github/workflows/test_export_tflite.yml +++ b/.github/workflows/test_export_tflite.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9', '3.11'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli.yml b/.github/workflows/test_export_tflite_cli.yml index e14e4cde325..cfca58cf9c1 100644 --- a/.github/workflows/test_export_tflite_cli.yml +++ b/.github/workflows/test_export_tflite_cli.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + 
python-version: ['3.9', '3.11'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml b/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml index 7e4a83b3b7b..9cebe8ac0f6 100644 --- a/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml +++ b/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_fp16.yml b/.github/workflows/test_export_tflite_cli_quantization_fp16.yml index 981dd005e52..ca35ad8b3eb 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_fp16.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_fp16.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml b/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml index 9064bfaf315..1531ffa5c9c 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml b/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml index 824e8933a08..7274d09c0f8 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml b/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml index e975997e379..6c8639ebfe0 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml b/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml index ef59cff0b92..39902d0dd50 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml 
+++ b/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_exporters_common.yml b/.github/workflows/test_exporters_common.yml index 11f6038afe4..801e0bebc55 100644 --- a/.github/workflows/test_exporters_common.yml +++ b/.github/workflows/test_exporters_common.yml @@ -15,10 +15,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] - os: [ubuntu-20.04] + python-version: ['3.9', '3.11'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_exporters_slow.yml b/.github/workflows/test_exporters_slow.yml index 453389d63fa..b5f142fc7dc 100644 --- a/.github/workflows/test_exporters_slow.yml +++ b/.github/workflows/test_exporters_slow.yml @@ -14,10 +14,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml index a4e6dd3cd29..0a1890cc715 100644 --- a/.github/workflows/test_fx.yml +++ b/.github/workflows/test_fx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_offline.yml b/.github/workflows/test_offline.yml index 20911fe6db8..29b7b183bd7 100644 --- a/.github/workflows/test_offline.yml +++ b/.github/workflows/test_offline.yml @@ -15,10 +15,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index dd1f3bee63d..418a9e42c1a 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 0ab95752d01..089300f7cd9 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -39,7 +39,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: '3.9' - name: Install dependencies run: | diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml index c5679e5b307..89d44e57ad1 100644 --- a/.github/workflows/test_onnxruntime_slow.yml +++ b/.github/workflows/test_onnxruntime_slow.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_optimum_common.yml b/.github/workflows/test_optimum_common.yml index 5ad42807a5f..9aab45e4b71 100644 --- a/.github/workflows/test_optimum_common.yml +++ b/.github/workflows/test_optimum_common.yml @@ -17,7 +17,7 @@ jobs: 
strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9]
+        python-version: ['3.9']
         os: [ubuntu-20.04, windows-2019, macos-13]

     runs-on: ${{ matrix.os }}
@@ -36,5 +36,5 @@ jobs:
         shell: bash
         run: |
           # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel.
-          export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }}
+          export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.9' && matrix.os == 'ubuntu-20.04' }}
           pytest tests/test_*.py
diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml
index b5f2e27fc6a..0126b023c60 100644
--- a/.github/workflows/test_utils.yml
+++ b/.github/workflows/test_utils.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-20.04, macos-13]
-        python-version: [3.9]
+        python-version: ['3.9']

     runs-on: ${{ matrix.os }}
     steps:
diff --git a/setup.py b/setup.py
index 82892bfcc8c..7ea0da56c29 100644
--- a/setup.py
+++ b/setup.py
@@ -123,9 +123,10 @@
         "Intended Audience :: Education",
         "Intended Audience :: Science/Research",
         "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
     keywords="transformers, quantization, pruning, optimization, training, inference, onnx, onnx runtime, intel, "
@@ -137,7 +138,7 @@
     packages=find_namespace_packages(include=["optimum*"]),
     install_requires=REQUIRED_PKGS,
     extras_require=EXTRAS_REQUIRE,
-    python_requires=">=3.7.0",
+    python_requires=">=3.9.0",
     include_package_data=True,
     zip_safe=False,
     entry_points={"console_scripts": ["optimum-cli=optimum.commands.optimum_cli:main"]},

From e8b03321035ea19001bcbb773444e3f0574d4150 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Fri, 15 Nov 2024 17:15:33 +0100
Subject: [PATCH 35/50] Update Dockerfile base image (#2089)

upgrade base image
---
 docs/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Dockerfile b/docs/Dockerfile
index 29ea0f916ce..d76dc50c556 100644
--- a/docs/Dockerfile
+++ b/docs/Dockerfile
@@ -1,4 +1,4 @@
-FROM nikolaik/python-nodejs:python3.8-nodejs18
+FROM nikolaik/python-nodejs:python3.9-nodejs18

 ARG commit_sha
 ARG clone_url

From c513437511e51ccedb4f28c30e6aea9c0cf76a4a Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Mon, 18 Nov 2024 13:47:29 +0100
Subject: [PATCH 36/50] Add transformers 4.36 tests (#2085)

* add transformers 4.36 tests
* add test depending on transformers version
* add min transformers required version for gemma
* update macos
* fix whisper test
* add opt
* fix mpt
* add comment
* add granite test when supported by transformers
---
 .github/workflows/test_onnxruntime.yml  |  4 ++-
 optimum/exporters/onnx/model_configs.py |  4 ++-
 setup.py                                | 10 +++----
 tests/onnxruntime/test_modeling.py      | 37 +++++++++++++++----------
 4 files changed, 33 insertions(+), 22 deletions(-)
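Usage sketch of the version-gating pattern these tests adopt; the helper is the one imported in test_modeling.py, while the architecture lists here are illustrative:

    from optimum.utils.import_utils import check_if_transformers_greater

    SUPPORTED_ARCHITECTURES = ["gpt2", "llama", "opt"]  # baseline set, runs on transformers 4.36

    if check_if_transformers_greater("4.38"):
        SUPPORTED_ARCHITECTURES.append("gemma")

    if check_if_transformers_greater("4.45"):
        SUPPORTED_ARCHITECTURES.append("granite")

diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
index 089300f7cd9..fec5c7e5b27 100644
--- a/.github/workflows/test_onnxruntime.yml
+++ b/.github/workflows/test_onnxruntime.yml
@@ -18,8 +18,10 @@ jobs:
       fail-fast: false
       matrix:
         transformers-version: ["latest"]
-        os: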
[ubuntu-20.04, windows-2019, macos-15] include: + - transformers-version: "4.36.*" + os: ubuntu-20.04 - transformers-version: "4.45.*" os: ubuntu-20.04 diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index cc752779d30..6b92109b7b6 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -295,7 +295,7 @@ class Qwen2OnnxConfig(LlamaOnnxConfig): class GemmaOnnxConfig(LlamaOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator - pass + MIN_TRANSFORMERS_VERSION = version.parse("4.38.0") class GraniteOnnxConfig(LlamaOnnxConfig): @@ -348,6 +348,8 @@ def patch_model_for_export( class MPTOnnxConfig(TextDecoderOnnxConfig): # MPT does not require position_ids input. DEFAULT_ONNX_OPSET = 13 + # TODO: fix inference for transformers < v4.41 for beam_search > 1 + MIN_TRANSFORMERS_VERSION = version.parse("4.41.0") NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="n_heads", hidden_size="d_model", num_layers="n_layers" ) diff --git a/setup.py b/setup.py index 7ea0da56c29..29f97b604e0 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "onnxruntime-gpu": [ "onnx", @@ -63,19 +63,19 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "exporters": [ "onnx", "onnxruntime", "timm", - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "exporters-gpu": [ "onnx", "onnxruntime-gpu", "timm", - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", @@ -86,7 +86,7 @@ "h5py", "numpy<1.24.0", "datasets<=2.16", - "transformers>=4.26,<4.38", + "transformers>=4.36,<4.38", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index a335e014478..84ac27029f9 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2318,21 +2318,28 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "bloom", "codegen", "falcon", - "gemma", "gpt2", "gpt_bigcode", "gpt_neo", "gpt_neox", "gptj", - "granite", "llama", "mistral", - "mpt", "opt", ] - if check_if_transformers_greater("4.40"): - SUPPORTED_ARCHITECTURES.extend(["gemma", "phi3", "qwen2"]) + if check_if_transformers_greater("4.37"): + SUPPORTED_ARCHITECTURES.append("qwen2") + + if check_if_transformers_greater("4.38"): + SUPPORTED_ARCHITECTURES.append("gemma") + + # TODO: fix "mpt" for which inference fails for transformers < v4.41 + if check_if_transformers_greater("4.41"): + SUPPORTED_ARCHITECTURES.extend(["phi3", "mpt"]) + + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES.append("granite") FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, @@ -2445,7 +2452,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach transformers_model = AutoModelForCausalLM.from_pretrained(model_id) transformers_model = transformers_model.eval() tokenizer = get_preprocessor(model_id) - tokens = tokenizer("This is a sample output", return_tensors="pt") + tokens = tokenizer("This is a sample input", return_tensors="pt") position_ids = None if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: input_shape = 
tokens["input_ids"].shape @@ -2467,7 +2474,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach # Compare batched generation. tokenizer.pad_token_id = tokenizer.eos_token_id tokenizer.padding_side = "left" - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens = tokenizer(["This is", "This is a sample input"], return_tensors="pt", padding=True) onnx_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None onnx_model.config.eos_token_id = None @@ -4598,14 +4605,14 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str): ) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual( - outputs_model_with_pkv.shape[1], - self.GENERATION_LENGTH + 2 if model_arch == "whisper" else self.GENERATION_LENGTH + 1, - ) - self.assertEqual( - outputs_model_without_pkv.shape[1], - self.GENERATION_LENGTH + 2 if model_arch == "whisper" else self.GENERATION_LENGTH + 1, - ) + + if model_arch == "whisper" and check_if_transformers_greater("4.43"): + gen_length = self.GENERATION_LENGTH + 2 + else: + gen_length = self.GENERATION_LENGTH + 1 + + self.assertEqual(outputs_model_with_pkv.shape[1], gen_length) + self.assertEqual(outputs_model_without_pkv.shape[1], gen_length) self.GENERATION_LENGTH = generation_length if os.environ.get("TEST_LEVEL", 0) == "1": From 400bb82f312016b0a31b342d48b00d031786417d Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:05:37 +0100 Subject: [PATCH 37/50] [`fix`] Allow ORTQuantizer over models with subfolder ONNX files (#2094) * Allow ORTQuantizer over models with subfolder ONNX files * Also catch ValueError as that seems a common fail when AutoConfig.from_pretrained("does/not/exist") * Use test case that previously failed --- optimum/onnxruntime/quantization.py | 9 +++++---- tests/onnxruntime/test_quantization.py | 8 ++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py index 056123f8d8e..f637916dcd2 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -100,7 +100,7 @@ def __init__(self, onnx_model_path: Path, config: Optional["PretrainedConfig"] = if self.config is None: try: self.config = AutoConfig.from_pretrained(self.onnx_model_path.parent) - except OSError: + except (OSError, ValueError): LOGGER.warning( f"Could not load the config for {self.onnx_model_path} automatically, this might make " "the quantized model harder to use because it will not be able to be loaded by an ORTModel without " @@ -134,6 +134,7 @@ def from_pretrained( model_or_path = Path(model_or_path) path = None + config = None if isinstance(model_or_path, ORTModelForConditionalGeneration): raise NotImplementedError(ort_quantizer_error_message) elif isinstance(model_or_path, Path) and file_name is None: @@ -147,13 +148,13 @@ def from_pretrained( file_name = onnx_files[0].name if isinstance(model_or_path, ORTModel): - if path is None: - path = Path(model_or_path.model._model_path) + path = Path(model_or_path.model._model_path) + config = model_or_path.config elif os.path.isdir(model_or_path): path = Path(model_or_path) / file_name else: raise ValueError(f"Unable to load model from {model_or_path}.") - return cls(path) + return cls(path, config=config) def fit( self, diff --git a/tests/onnxruntime/test_quantization.py 
index b6f1ebb70f6..34a9504f95a 100644
--- a/tests/onnxruntime/test_quantization.py
+++ b/tests/onnxruntime/test_quantization.py
@@ -30,6 +30,7 @@
     AutoQuantizationConfig,
     ORTConfig,
     ORTModelForCausalLM,
+    ORTModelForFeatureExtraction,
     ORTModelForSeq2SeqLM,
     ORTModelForSequenceClassification,
     ORTQuantizer,
@@ -52,6 +53,13 @@ class ORTQuantizerTest(unittest.TestCase):
                 "optimum/distilbert-base-uncased-finetuned-sst-2-english"
             )
         },
+        "ort_model_with_onnx_model_in_subfolder": {
+            "model_or_path": ORTModelForFeatureExtraction.from_pretrained(
+                "sentence-transformers/all-MiniLM-L6-v2",
+                subfolder="onnx",
+                file_name="model.onnx",
+            )
+        },
     }

     @parameterized.expand(LOAD_CONFIGURATION.items())

From a7a807c9e712fd9669865358e34c1de072b78d8e Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Tue, 19 Nov 2024 13:10:57 +0100
Subject: [PATCH 38/50] SD3 and Flux support (#2073)

* sd3 support
* unsupported cli model types
* flux transformer support, unet export fixes, updated callback test, updated negative prompt test, flux and sd3 tests
* fixes
* move input generators
* dummy diffusers
* style
* distribute ort tests
* fix
* fix
* fix
* test num images
* single process to reduce re-exports
* test
* revert unnecessary changes
* T5Encoder inherits from TextEncoder
* style
* fix typo in timestep
* style
* only test sd3 and flux on latest transformers
* conditional sd3 and flux modeling
* forgot sd3 inpaint
---
 .github/workflows/test_onnxruntime.yml       |  13 +-
 optimum/exporters/onnx/base.py               |   1 +
 optimum/exporters/onnx/convert.py            |   4 +
 optimum/exporters/onnx/model_configs.py      | 123 +++++++++--
 optimum/exporters/tasks.py                   |  29 ++-
 optimum/exporters/utils.py                   | 190 +++++++++++------
 optimum/onnxruntime/__init__.py              |  72 +++++--
 optimum/onnxruntime/modeling_diffusion.py    | 202 +++++++++++++++++--
 optimum/utils/__init__.py                    |   7 +
 optimum/utils/constant.py                    |   4 +-
 optimum/utils/dummy_diffusers_objects.py     |  74 ++++++-
 optimum/utils/input_generators.py            |  81 +++++++-
 tests/exporters/exporters_utils.py           |   4 +-
 tests/exporters/onnx/test_onnx_export.py     |   2 -
 tests/onnxruntime/test_diffusion.py          | 192 +++++++++++-------
 tests/onnxruntime/test_modeling.py           |   2 +-
 tests/onnxruntime/test_quantization.py       |   4 +-
 tests/onnxruntime/utils_onnxruntime_tests.py |   4 +-
 18 files changed, 791 insertions(+), 217 deletions(-)
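Usage sketch of the task-level entrypoint with a Flux checkpoint; the model id and prompt are illustrative, and export=True converts the submodels (transformer, text encoders, VAE) to ONNX on the fly:

    from optimum.onnxruntime import ORTPipelineForText2Image

    # Dispatches to the matching ORT pipeline class (here ORTFluxPipeline)
    # based on the checkpoint's model_index.json.
    pipeline = ORTPipelineForText2Image.from_pretrained(
        "black-forest-labs/FLUX.1-schnell", export=True
    )
    image = pipeline(prompt="a photo of an astronaut riding a horse", num_inference_steps=4).images[0]

diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
index fec5c7e5b27..b20a3b46f88 100644
--- a/.github/workflows/test_onnxruntime.yml
+++ b/.github/workflows/test_onnxruntime.yml
@@ -26,14 +26,11 @@ jobs:
           os: ubuntu-20.04

     runs-on: ${{ matrix.os }}
+
     steps:
       - name: Free Disk Space (Ubuntu)
         if: matrix.os == 'ubuntu-20.04'
         uses: jlumbroso/free-disk-space@main
-        with:
-          tool-cache: false
-          swap-storage: false
-          large-packages: false

       - name: Checkout code
         uses: actions/checkout@v4
@@ -54,13 +51,11 @@ jobs:
         run: pip install transformers==${{ matrix.transformers-version }}

       - name: Test with pytest (in series)
-        working-directory: tests
         run: |
-          pytest onnxruntime -m "run_in_series" --durations=0 -vvvv -s
+          pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv -s

       - name: Test with pytest (in parallel)
+        run: |
+          pytest tests/onnxruntime -m "not run_in_series" --durations=0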
-vvvv -s -n auto env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - working-directory: tests - run: | - pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index 8cd94194ffe..7e35691d54b 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -319,6 +319,7 @@ def fix_dynamic_axes( input_shapes = {} dummy_inputs = self.generate_dummy_inputs(framework="np", **input_shapes) dummy_inputs = self.generate_dummy_inputs_for_validation(dummy_inputs, onnx_input_names=onnx_input_names) + dummy_inputs = self.rename_ambiguous_inputs(dummy_inputs) onnx_inputs = {} for name, value in dummy_inputs.items(): diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 2661d835979..c12a9ac222a 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -1183,6 +1183,10 @@ def onnx_export_from_model( if tokenizer_2 is not None: tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + tokenizer_3 = getattr(model, "tokenizer_3", None) + if tokenizer_3 is not None: + tokenizer_3.save_pretrained(output.joinpath("tokenizer_3")) + model.save_config(output) if float_dtype == "bf16": diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 6b92109b7b6..8984162ee8c 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Model specific ONNX configurations.""" + import random from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union @@ -28,6 +29,8 @@ DummyCodegenDecoderTextInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, + DummyFluxTransformerTextInputGenerator, + DummyFluxTransformerVisionInputGenerator, DummyInputGenerator, DummyIntGenerator, DummyPastKeyValuesGenerator, @@ -38,6 +41,9 @@ DummySpeechT5InputGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, + DummyTransformerTextInputGenerator, + DummyTransformerTimestepInputGenerator, + DummyTransformerVisionInputGenerator, DummyVisionEmbeddingsGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator, DummyVisionInputGenerator, @@ -53,6 +59,7 @@ NormalizedTextConfig, NormalizedTextConfigWithGQA, NormalizedVisionConfig, + check_if_diffusers_greater, check_if_transformers_greater, is_diffusers_available, logging, @@ -1039,22 +1046,13 @@ def outputs(self) -> Dict[str, Dict[int, str]]: "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, "pooler_output": {0: "batch_size"}, } + if self._normalized_config.output_hidden_states: for i in range(self._normalized_config.num_layers + 1): common_outputs[f"hidden_states.{i}"] = {0: "batch_size", 1: "sequence_length"} return common_outputs - def generate_dummy_inputs(self, framework: str = "pt", **kwargs): - dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) - - # TODO: fix should be by casting inputs during inference and not export - if framework == "pt": - import torch - - dummy_inputs["input_ids"] = dummy_inputs["input_ids"].to(dtype=torch.int32) - return dummy_inputs - def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], @@ -1064,7 +1062,7 @@ def patch_model_for_export( class UNetOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-3 + 
ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1087,17 +1085,19 @@ class UNetOnnxConfig(VisionOnnxConfig): def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = { "sample": {0: "batch_size", 2: "height", 3: "width"}, - "timestep": {0: "steps"}, + "timestep": {}, # a scalar with no dimension "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, } - # TODO : add text_image, image and image_embeds + # TODO : add addition_embed_type == text_image, image and image_embeds + # https://github.com/huggingface/diffusers/blob/9366c8f84bfe47099ff047272661786ebb54721d/src/diffusers/models/unets/unet_2d_condition.py#L671 if getattr(self._normalized_config, "addition_embed_type", None) == "text_time": common_inputs["text_embeds"] = {0: "batch_size"} common_inputs["time_ids"] = {0: "batch_size"} if getattr(self._normalized_config, "time_cond_proj_dim", None) is not None: common_inputs["timestep_cond"] = {0: "batch_size"} + return common_inputs @property @@ -1136,7 +1136,7 @@ def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]: class VaeEncoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-4 + ATOL_FOR_VALIDATION = 3e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1184,6 +1184,101 @@ def outputs(self) -> Dict[str, Dict[int, str]]: } +class T5EncoderOnnxConfig(TextEncoderOnnxConfig): + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 12 # int64 was supported since opset 12 + + @property + def inputs(self): + return { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + } + + @property + def outputs(self): + return { + "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, + } + + +class SD3TransformerOnnxConfig(VisionOnnxConfig): + ATOL_FOR_VALIDATION = 1e-4 + # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu + # operator support, available since opset 14 + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestepInputGenerator, + DummyTransformerVisionInputGenerator, + DummyTransformerTextInputGenerator, + ) + + NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( + image_size="sample_size", + num_channels="in_channels", + vocab_size="attention_head_dim", + hidden_size="joint_attention_dim", + projection_size="pooled_projection_dim", + allow_new=True, + ) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = { + "hidden_states": {0: "batch_size", 2: "height", 3: "width"}, + "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, + "pooled_projections": {0: "batch_size"}, + "timestep": {0: "step"}, + } + + return common_inputs + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "out_hidden_states": {0: "batch_size", 2: "height", 3: "width"}, + } + + @property + def torch_to_onnx_output_map(self) -> Dict[str, str]: + return { + "sample": "out_hidden_states", + } + + +class FluxTransformerOnnxConfig(SD3TransformerOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestepInputGenerator, + DummyFluxTransformerVisionInputGenerator, + DummyFluxTransformerTextInputGenerator, + ) + + @property + def inputs(self): + common_inputs = super().inputs + common_inputs["hidden_states"] = {0: 
"batch_size", 1: "packed_height_width"} + common_inputs["txt_ids"] = ( + {0: "sequence_length"} if check_if_diffusers_greater("0.31.0") else {0: "batch_size", 1: "sequence_length"} + ) + common_inputs["img_ids"] = ( + {0: "packed_height_width"} + if check_if_diffusers_greater("0.31.0") + else {0: "batch_size", 1: "packed_height_width"} + ) + + if getattr(self._normalized_config, "guidance_embeds", False): + common_inputs["guidance"] = {0: "batch_size"} + + return common_inputs + + @property + def outputs(self): + return { + "out_hidden_states": {0: "batch_size", 1: "packed_height_width"}, + } + + class GroupViTOnnxConfig(CLIPOnnxConfig): pass diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index fdc8bfcb539..b4bce4696f3 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -335,7 +335,11 @@ class TasksManager: } _DIFFUSERS_SUPPORTED_MODEL_TYPE = { - "clip-text-model": supported_tasks_mapping( + "t5-encoder": supported_tasks_mapping( + "feature-extraction", + onnx="T5EncoderOnnxConfig", + ), + "clip-text": supported_tasks_mapping( "feature-extraction", onnx="CLIPTextOnnxConfig", ), @@ -343,7 +347,15 @@ class TasksManager: "feature-extraction", onnx="CLIPTextWithProjectionOnnxConfig", ), - "unet": supported_tasks_mapping( + "flux-transformer-2d": supported_tasks_mapping( + "semantic-segmentation", + onnx="FluxTransformerOnnxConfig", + ), + "sd3-transformer-2d": supported_tasks_mapping( + "semantic-segmentation", + onnx="SD3TransformerOnnxConfig", + ), + "unet-2d-condition": supported_tasks_mapping( "semantic-segmentation", onnx="UNetOnnxConfig", ), @@ -1177,12 +1189,17 @@ class TasksManager: "transformers": _SUPPORTED_MODEL_TYPE, } _UNSUPPORTED_CLI_MODEL_TYPE = { - "unet", + # diffusers model types + "clip-text", + "clip-text-with-projection", + "flux-transformer-2d", + "sd3-transformer-2d", + "t5-encoder", + "unet-2d-condition", "vae-encoder", "vae-decoder", - "clip-text-model", - "clip-text-with-projection", - "trocr", # supported through the vision-encoder-decoder model type + # redundant model types + "trocr", # same as vision-encoder-decoder } _SUPPORTED_CLI_MODEL_TYPE = ( set(_SUPPORTED_MODEL_TYPE.keys()) diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index 949b54f4685..60de169de5e 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -15,7 +15,6 @@ """Utilities for model preparation to export.""" - import copy from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union @@ -44,12 +43,7 @@ "Please update diffusers by running `pip install --upgrade diffusers`" ) - from diffusers import ( - DiffusionPipeline, - StableDiffusionXLImg2ImgPipeline, - StableDiffusionXLInpaintPipeline, - StableDiffusionXLPipeline, - ) + from diffusers import DiffusionPipeline from diffusers.models.attention_processor import ( Attention, AttnAddedKVProcessor, @@ -80,6 +74,20 @@ DECODER_MERGED_NAME = "decoder_model_merged" +_DIFFUSERS_CLASS_NAME_TO_SUBMODEL_TYPE = { + "CLIPTextModel": "clip-text", + "CLIPTextModelWithProjection": "clip-text-with-projection", + "FluxTransformer2DModel": "flux-transformer-2d", + "SD3Transformer2DModel": "sd3-transformer-2d", + "UNet2DConditionModel": "unet-2d-condition", + "T5EncoderModel": "t5-encoder", +} + + +def _get_diffusers_submodel_type(submodel): + return _DIFFUSERS_CLASS_NAME_TO_SUBMODEL_TYPE.get(submodel.__class__.__name__) + + def _get_submodels_for_export_diffusion( pipeline: "DiffusionPipeline", ) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]: @@ 
-87,56 +95,87 @@ def _get_submodels_for_export_diffusion( Returns the components of a Stable Diffusion model. """ - is_stable_diffusion_xl = isinstance( - pipeline, (StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline) - ) - if is_stable_diffusion_xl: - projection_dim = pipeline.text_encoder_2.config.projection_dim - else: - projection_dim = pipeline.text_encoder.config.projection_dim - models_for_export = {} + is_torch_greater_or_equal_than_2_1 = version.parse(torch.__version__) >= version.parse("2.1.0") + is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL") + is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3") + # Text encoder text_encoder = getattr(pipeline, "text_encoder", None) if text_encoder is not None: - if is_stable_diffusion_xl: + if is_sdxl or is_sd3: text_encoder.config.output_hidden_states = True + text_encoder.text_model.config.output_hidden_states = True + + text_encoder.config.export_model_type = _get_diffusers_submodel_type(text_encoder) models_for_export["text_encoder"] = text_encoder - # U-NET - # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 - is_torch_greater_or_equal_than_2_1 = version.parse(torch.__version__) >= version.parse("2.1.0") - if not is_torch_greater_or_equal_than_2_1: - pipeline.unet.set_attn_processor(AttnProcessor()) + # Text encoder 2 + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + if is_sdxl or is_sd3: + text_encoder_2.config.output_hidden_states = True + text_encoder_2.text_model.config.output_hidden_states = True - pipeline.unet.config.text_encoder_projection_dim = projection_dim - # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` - # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 - pipeline.unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) - models_for_export["unet"] = pipeline.unet + text_encoder_2.config.export_model_type = _get_diffusers_submodel_type(text_encoder_2) + models_for_export["text_encoder_2"] = text_encoder_2 - # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + # Text encoder 3 + text_encoder_3 = getattr(pipeline, "text_encoder_3", None) + if text_encoder_3 is not None: + text_encoder_3.config.export_model_type = _get_diffusers_submodel_type(text_encoder_3) + models_for_export["text_encoder_3"] = text_encoder_3 + + # U-NET + unet = getattr(pipeline, "unet", None) + if unet is not None: + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 + if not is_torch_greater_or_equal_than_2_1: + unet.set_attn_processor(AttnProcessor()) + + # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` + # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 + unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + unet.config.time_cond_proj_dim = getattr(pipeline.unet.config, "time_cond_proj_dim", None) + unet.config.text_encoder_projection_dim = pipeline.text_encoder.config.projection_dim + unet.config.export_model_type = _get_diffusers_submodel_type(unet) + models_for_export["unet"] = unet + + # Transformer + transformer = getattr(pipeline, 
"transformer", None) + if transformer is not None: + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 + if not is_torch_greater_or_equal_than_2_1: + transformer.set_attn_processor(AttnProcessor()) + + transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + transformer.config.time_cond_proj_dim = getattr(pipeline.transformer.config, "time_cond_proj_dim", None) + transformer.config.text_encoder_projection_dim = pipeline.text_encoder.config.projection_dim + transformer.config.export_model_type = _get_diffusers_submodel_type(transformer) + models_for_export["transformer"] = transformer + + # VAE Encoder vae_encoder = copy.deepcopy(pipeline.vae) + + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 if not is_torch_greater_or_equal_than_2_1: vae_encoder = override_diffusers_2_0_attn_processors(vae_encoder) + # we return the distribution parameters to be able to recreate it in the decoder vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} models_for_export["vae_encoder"] = vae_encoder - # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + # VAE Decoder vae_decoder = copy.deepcopy(pipeline.vae) + + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 if not is_torch_greater_or_equal_than_2_1: vae_decoder = override_diffusers_2_0_attn_processors(vae_decoder) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) models_for_export["vae_decoder"] = vae_decoder - text_encoder_2 = getattr(pipeline, "text_encoder_2", None) - if text_encoder_2 is not None: - text_encoder_2.config.output_hidden_states = True - text_encoder_2.text_model.config.output_hidden_states = True - models_for_export["text_encoder_2"] = text_encoder_2 - return models_for_export @@ -294,33 +333,59 @@ def get_diffusion_models_for_export( `Dict[str, Tuple[Union[`PreTrainedModel`, `TFPreTrainedModel`], `ExportConfig`]: A Dict containing the model and export configs for the different components of the model. 
""" + models_for_export = _get_submodels_for_export_diffusion(pipeline) # Text encoder if "text_encoder" in models_for_export: + text_encoder = models_for_export["text_encoder"] text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.text_encoder, - exporter=exporter, - library_name="diffusers", - task="feature-extraction", + model=text_encoder, exporter=exporter, library_name="diffusers", task="feature-extraction" ) text_encoder_export_config = text_encoder_config_constructor( - pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype ) models_for_export["text_encoder"] = (models_for_export["text_encoder"], text_encoder_export_config) + # Text encoder 2 + if "text_encoder_2" in models_for_export: + text_encoder_2 = models_for_export["text_encoder_2"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_2, exporter=exporter, library_name="diffusers", task="feature-extraction" + ) + export_config = export_config_constructor(text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], export_config) + + # Text encoder 3 + if "text_encoder_3" in models_for_export: + text_encoder_3 = models_for_export["text_encoder_3"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_3, exporter=exporter, library_name="diffusers", task="feature-extraction" + ) + export_config = export_config_constructor(text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["text_encoder_3"] = (models_for_export["text_encoder_3"], export_config) + # U-NET - export_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.unet, - exporter=exporter, - library_name="diffusers", - task="semantic-segmentation", - model_type="unet", - ) - unet_export_config = export_config_constructor(pipeline.unet.config, int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["unet"] = (models_for_export["unet"], unet_export_config) + if "unet" in models_for_export: + unet = models_for_export["unet"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=unet, exporter=exporter, library_name="diffusers", task="semantic-segmentation" + ) + unet_export_config = export_config_constructor(unet.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["unet"] = (models_for_export["unet"], unet_export_config) - # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + # Transformer + if "transformer" in models_for_export: + transformer = models_for_export["transformer"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, exporter=exporter, library_name="diffusers", task="semantic-segmentation" + ) + transformer_export_config = export_config_constructor( + transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (models_for_export["transformer"], transformer_export_config) + + # VAE Encoder vae_encoder = models_for_export["vae_encoder"] vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_encoder, @@ -329,10 +394,12 @@ def get_diffusion_models_for_export( task="semantic-segmentation", model_type="vae-encoder", ) - vae_export_config = vae_config_constructor(vae_encoder.config, 
int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["vae_encoder"] = (vae_encoder, vae_export_config) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) - # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + # VAE Decoder vae_decoder = models_for_export["vae_decoder"] vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_decoder, @@ -341,21 +408,10 @@ def get_diffusion_models_for_export( task="semantic-segmentation", model_type="vae-decoder", ) - vae_export_config = vae_config_constructor(vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["vae_decoder"] = (vae_decoder, vae_export_config) - - if "text_encoder_2" in models_for_export: - export_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.text_encoder_2, - exporter=exporter, - library_name="diffusers", - task="feature-extraction", - model_type="clip-text-with-projection", - ) - export_config = export_config_constructor( - pipeline.text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype - ) - models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], export_config) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) return models_for_export diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 4e25a436909..f3f1535fd45 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -74,33 +74,51 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: _import_structure[".utils.dummy_diffusers_objects"] = [ - "ORTStableDiffusionPipeline", + "ORTDiffusionPipeline", + "ORTPipelineForText2Image", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + # flux + "ORTFluxPipeline", + # lcm + "ORTLatentConsistencyModelImg2ImgPipeline", + "ORTLatentConsistencyModelPipeline", + # sd3 + "ORTStableDiffusion3Img2ImgPipeline", + "ORTStableDiffusion3InpaintPipeline", + "ORTStableDiffusion3Pipeline", + # sd "ORTStableDiffusionImg2ImgPipeline", "ORTStableDiffusionInpaintPipeline", - "ORTStableDiffusionXLPipeline", + "ORTStableDiffusionPipeline", + # xl "ORTStableDiffusionXLImg2ImgPipeline", "ORTStableDiffusionXLInpaintPipeline", - "ORTLatentConsistencyModelPipeline", - "ORTLatentConsistencyModelImg2ImgPipeline", - "ORTPipelineForImage2Image", - "ORTPipelineForInpainting", - "ORTPipelineForText2Image", - "ORTDiffusionPipeline", + "ORTStableDiffusionXLPipeline", ] else: _import_structure["modeling_diffusion"] = [ - "ORTStableDiffusionPipeline", + "ORTDiffusionPipeline", + "ORTPipelineForText2Image", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + # flux + "ORTFluxPipeline", + # lcm + "ORTLatentConsistencyModelImg2ImgPipeline", + "ORTLatentConsistencyModelPipeline", + # sd3 + "ORTStableDiffusion3Img2ImgPipeline", + "ORTStableDiffusion3InpaintPipeline", + "ORTStableDiffusion3Pipeline", + # sd "ORTStableDiffusionImg2ImgPipeline", "ORTStableDiffusionInpaintPipeline", - "ORTStableDiffusionXLPipeline", + "ORTStableDiffusionPipeline", + # xl "ORTStableDiffusionXLImg2ImgPipeline", "ORTStableDiffusionXLInpaintPipeline", - "ORTLatentConsistencyModelImg2ImgPipeline", - 
"ORTLatentConsistencyModelPipeline", - "ORTPipelineForImage2Image", - "ORTPipelineForInpainting", - "ORTPipelineForText2Image", - "ORTDiffusionPipeline", + "ORTStableDiffusionXLPipeline", ] @@ -151,30 +169,52 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( + # generic entrypoint ORTDiffusionPipeline, + # flux + ORTFluxPipeline, + # lcm ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, + # task-specific entrypoints ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, + # sd3 + ORTStableDiffusion3Img2ImgPipeline, + ORTStableDiffusion3InpaintPipeline, + ORTStableDiffusion3Pipeline, + # sd ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, + # xl ORTStableDiffusionXLImg2ImgPipeline, ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: from .modeling_diffusion import ( + # generic entrypoint ORTDiffusionPipeline, + # flux + ORTFluxPipeline, + # lcm ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, + # task-specific entrypoints ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, + # sd3 + ORTStableDiffusion3Img2ImgPipeline, + ORTStableDiffusion3InpaintPipeline, + ORTStableDiffusion3Pipeline, + # sd ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, + # xl ORTStableDiffusionXLImg2ImgPipeline, ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 3899a7b36b6..79d302be449 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -57,7 +57,9 @@ from ..onnx.utils import _get_model_external_data_paths from ..utils import ( DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, + DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER, DIFFUSION_MODEL_UNET_SUBFOLDER, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, @@ -76,7 +78,7 @@ if check_if_diffusers_greater("0.25.0"): from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution else: - from diffusers.models.vae import DiagonalGaussianDistribution + from diffusers.models.vae import DiagonalGaussianDistribution # type: ignore logger = logging.getLogger(__name__) @@ -92,15 +94,18 @@ class ORTDiffusionPipeline(ORTModel, DiffusionPipeline): def __init__( self, scheduler: "SchedulerMixin", - unet_session: ort.InferenceSession, vae_decoder_session: ort.InferenceSession, # optional pipeline models + unet_session: Optional[ort.InferenceSession] = None, + transformer_session: Optional[ort.InferenceSession] = None, vae_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, + text_encoder_3_session: Optional[ort.InferenceSession] = None, # optional pipeline submodels tokenizer: Optional["CLIPTokenizer"] = None, tokenizer_2: Optional["CLIPTokenizer"] = None, + tokenizer_3: Optional["CLIPTokenizer"] = None, feature_extractor: Optional["CLIPFeatureExtractor"] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, @@ -111,16 +116,20 @@ def __init__( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - self.unet = 
ORTModelUnet(unet_session, self) - self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) - self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None + self.unet = ORTModelUnet(unet_session, self) if unet_session is not None else None + self.transformer = ORTModelTransformer(transformer_session, self) if transformer_session is not None else None self.text_encoder = ( ORTModelTextEncoder(text_encoder_session, self) if text_encoder_session is not None else None ) self.text_encoder_2 = ( ORTModelTextEncoder(text_encoder_2_session, self) if text_encoder_2_session is not None else None ) + self.text_encoder_3 = ( + ORTModelTextEncoder(text_encoder_3_session, self) if text_encoder_3_session is not None else None + ) # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API + self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None + self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) if vae_decoder_session is not None else None self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) # we allow passing these as torch models for now @@ -130,18 +139,22 @@ def __init__( self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 + self.tokenizer_3 = tokenizer_3 self.feature_extractor = feature_extractor all_pipeline_init_args = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": self.text_encoder_3, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, "scheduler": self.scheduler, "tokenizer": self.tokenizer, "tokenizer_2": self.tokenizer_2, + "tokenizer_3": self.tokenizer_3, "feature_extractor": self.feature_extractor, "requires_aesthetics_score": requires_aesthetics_score, "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, @@ -157,7 +170,10 @@ def __init__( # inits ort specific attributes self.shared_attributes_init( - model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs + model=unet_session if unet_session is not None else transformer_session, + use_io_binding=use_io_binding, + model_save_dir=model_save_dir, + **kwargs, ) def _save_pretrained(self, save_directory: Union[str, Path]): @@ -165,10 +181,12 @@ def _save_pretrained(self, save_directory: Union[str, Path]): models_to_save_paths = { (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER), + (self.transformer, save_directory / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER), (self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER), (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), + (self.text_encoder_3, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER), } for model, save_path in models_to_save_paths: if model is not None: @@ -192,6 +210,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.tokenizer_3 is not None: + self.tokenizer_3.save_pretrained(save_directory / "tokenizer_3") if self.feature_extractor is not None: self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @@ 
-208,10 +228,12 @@ def _from_pretrained( cache_dir: str = HUGGINGFACE_HUB_CACHE, token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, + transformer_file_name: str = ONNX_WEIGHTS_NAME, vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, + text_encoder_3_file_name: str = ONNX_WEIGHTS_NAME, use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", provider_options: Optional[Dict[str, Any]] = None, @@ -230,10 +252,12 @@ def _from_pretrained( allow_patterns.update( { unet_file_name, + transformer_file_name, vae_decoder_file_name, vae_encoder_file_name, text_encoder_file_name, text_encoder_2_file_name, + text_encoder_3_file_name, SCHEDULER_CONFIG_NAME, cls.config_name, CONFIG_NAME, @@ -259,10 +283,12 @@ def _from_pretrained( model_paths = { "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "transformer": model_save_path / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER / transformer_file_name, "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + "text_encoder_3": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER / text_encoder_3_file_name, } sessions = {} @@ -276,7 +302,7 @@ def _from_pretrained( ) submodels = {} - for submodel in {"scheduler", "tokenizer", "tokenizer_2", "feature_extractor"}: + for submodel in {"scheduler", "tokenizer", "tokenizer_2", "tokenizer_3", "feature_extractor"}: if kwargs.get(submodel, None) is not None: submodels[submodel] = kwargs.pop(submodel) elif config.get(submodel, (None, None))[0] is not None: @@ -385,17 +411,24 @@ def to(self, device: Union[torch.device, str, int]): if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": return self - self.unet.session.set_providers([provider], provider_options=[provider_options]) self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) + if self.unet is not None: + self.unet.session.set_providers([provider], provider_options=[provider_options]) + if self.transformer is not None: + self.transformer.session.set_providers([provider], provider_options=[provider_options]) if self.vae_encoder is not None: self.vae_encoder.session.set_providers([provider], provider_options=[provider_options]) if self.text_encoder is not None: self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) if self.text_encoder_2 is not None: self.text_encoder_2.session.set_providers([provider], provider_options=[provider_options]) + if self.text_encoder_3 is not None: + self.text_encoder_3.session.set_providers([provider], provider_options=[provider_options]) - self.providers = self.unet.session.get_providers() + self.providers = ( + self.unet.session.get_providers() if self.unet is not None else self.transformer.session.get_providers() + ) self._device = device return self @@ -412,8 +445,10 @@ def components(self) -> Dict[str, Any]: components = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": 
self.text_encoder_3, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, } @@ -443,9 +478,13 @@ def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTDiffusionP self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in self.session.get_inputs()} self.output_dtypes = {output_key.name: output_key.type for output_key in self.session.get_outputs()} + self.input_shapes = {input_key.name: input_key.shape for input_key in self.session.get_inputs()} + self.output_shapes = {output_key.name: output_key.shape for output_key in self.session.get_outputs()} + config_file_path = Path(session._model_path).parent / self.config_name if not config_file_path.is_file(): # config is mandatory for the model part to be used for inference @@ -543,13 +582,18 @@ def __init__(self, *args, **kwargs): ) self.register_to_config(time_cond_proj_dim=None) + if len(self.input_shapes["timestep"]) > 0: + logger.warning( + "The exported unet onnx model expects a non scalar timestep input. " + "We will have to unsqueeze the timestep input at each iteration which might be inefficient. " + "Please re-export the pipeline with newer version of optimum and diffusers to avoid this warning." + ) + def forward( self, sample: Union[np.ndarray, torch.Tensor], timestep: Union[np.ndarray, torch.Tensor], encoder_hidden_states: Union[np.ndarray, torch.Tensor], - text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, - time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, added_cond_kwargs: Optional[Dict[str, Any]] = None, @@ -557,15 +601,13 @@ def forward( ): use_torch = isinstance(sample, torch.Tensor) - if len(timestep.shape) == 0: + if len(self.input_shapes["timestep"]) > 0: timestep = timestep.unsqueeze(0) model_inputs = { "sample": sample, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, - "text_embeds": text_embeds, - "time_ids": time_ids, "timestep_cond": timestep_cond, **(cross_attention_kwargs or {}), **(added_cond_kwargs or {}), @@ -581,6 +623,42 @@ def forward( return ModelOutput(**model_outputs) +class ORTModelTransformer(ORTPipelinePart): + def forward( + self, + hidden_states: Union[np.ndarray, torch.Tensor], + encoder_hidden_states: Union[np.ndarray, torch.Tensor], + pooled_projections: Union[np.ndarray, torch.Tensor], + timestep: Union[np.ndarray, torch.Tensor], + guidance: Optional[Union[np.ndarray, torch.Tensor]] = None, + txt_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + img_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = False, + ): + use_torch = isinstance(hidden_states, torch.Tensor) + + model_inputs = { + "hidden_states": hidden_states, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "timestep": timestep, + "guidance": guidance, + "txt_ids": txt_ids, + "img_ids": img_ids, + **(joint_attention_kwargs or {}), + } + + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if return_dict: + return model_outputs + + return 
ModelOutput(**model_outputs) + + class ORTModelTextEncoder(ORTPipelinePart): def forward( self, @@ -599,11 +677,13 @@ def forward( if output_hidden_states: model_outputs["hidden_states"] = [] - for i in range(self.config.num_hidden_layers): + num_layers = self.num_hidden_layers if hasattr(self, "num_hidden_layers") else self.num_decoder_layers + for i in range(num_layers): model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) else: - for i in range(self.config.num_hidden_layers): + num_layers = self.num_hidden_layers if hasattr(self, "num_hidden_layers") else self.num_decoder_layers + for i in range(num_layers): model_outputs.pop(f"hidden_states.{i}", None) if return_dict: @@ -620,7 +700,7 @@ def __init__(self, *args, **kwargs): if not hasattr(self.config, "scaling_factor"): logger.warning( "The `scaling_factor` attribute is missing from the VAE encoder configuration. " - "Please re-export the model with newer version of optimum and diffusers." + "Please re-export the model with newer version of optimum and diffusers to avoid this warning." ) self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) @@ -660,7 +740,7 @@ def __init__(self, *args, **kwargs): if not hasattr(self.config, "scaling_factor"): logger.warning( "The `scaling_factor` attribute is missing from the VAE decoder configuration. " - "Please re-export the model with newer version of optimum and diffusers." + "Please re-export the model with newer version of optimum and diffusers to avoid this warning." ) self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) @@ -871,6 +951,80 @@ class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsi auto_model_class = LatentConsistencyModelImg2ImgPipeline +class ORTUnavailablePipeline: + MIN_VERSION = None + + def __init__(self, *args, **kwargs): + raise NotImplementedError( + f"The pipeline {self.__class__.__name__} is not available in the current version of `diffusers`. " + f"Please upgrade `diffusers` to {self.MIN_VERSION} or later." + ) + + +if check_if_diffusers_greater("0.29.0"): + from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTStableDiffusion3Pipeline(ORTDiffusionPipeline, StableDiffusion3Pipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusion3Pipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusion3Pipeline). + """ + + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusion3Pipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTStableDiffusion3Img2ImgPipeline(ORTDiffusionPipeline, StableDiffusion3Img2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusion3Img2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusion3Img2ImgPipeline). 
+ """ + + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = StableDiffusion3Img2ImgPipeline + +else: + + class ORTStableDiffusion3Pipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.29.0" + + class ORTStableDiffusion3Img2ImgPipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.29.0" + + +if check_if_diffusers_greater("0.30.0"): + from diffusers import FluxPipeline, StableDiffusion3InpaintPipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTStableDiffusion3InpaintPipeline(ORTDiffusionPipeline, StableDiffusion3InpaintPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusion3InpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusion3InpaintPipeline). + """ + + main_input_name = "prompt" + export_feature = "inpainting" + auto_model_class = StableDiffusion3InpaintPipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTFluxPipeline(ORTDiffusionPipeline, FluxPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.FluxPipeline](https://huggingface.co/docs/diffusers/api/pipelines/flux/text2img#diffusers.FluxPipeline). + """ + + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = FluxPipeline + +else: + + class ORTStableDiffusion3InpaintPipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.30.0" + + class ORTFluxPipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.30.0" + + SUPPORTED_ORT_PIPELINES = [ ORTStableDiffusionPipeline, ORTStableDiffusionImg2ImgPipeline, @@ -880,6 +1034,10 @@ class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsi ORTStableDiffusionXLInpaintPipeline, ORTLatentConsistencyModelPipeline, ORTLatentConsistencyModelImg2ImgPipeline, + ORTStableDiffusion3Pipeline, + ORTStableDiffusion3Img2ImgPipeline, + ORTStableDiffusion3InpaintPipeline, + ORTFluxPipeline, ] @@ -897,23 +1055,27 @@ def _get_ort_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tr ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ + ("flux", ORTFluxPipeline), + ("latent-consistency", ORTLatentConsistencyModelPipeline), ("stable-diffusion", ORTStableDiffusionPipeline), + ("stable-diffusion-3", ORTStableDiffusion3Pipeline), ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), - ("latent-consistency", ORTLatentConsistencyModelPipeline), ] ) ORT_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( [ + ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), + ("stable-diffusion-3", ORTStableDiffusion3Img2ImgPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), - ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ] ) ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ("stable-diffusion-3", ORTStableDiffusion3InpaintPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLInpaintPipeline), ] ) diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index db7d1f6975d..40d93d298e4 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -16,7 +16,9 @@ from .constant import ( CONFIG_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, + DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER, DIFFUSION_MODEL_UNET_SUBFOLDER, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, @@ -52,6 +54,8 
@@ DummyCodegenDecoderTextInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, + DummyFluxTransformerTextInputGenerator, + DummyFluxTransformerVisionInputGenerator, DummyInputGenerator, DummyIntGenerator, DummyLabelsGenerator, @@ -63,6 +67,9 @@ DummySpeechT5InputGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, + DummyTransformerTextInputGenerator, + DummyTransformerTimestepInputGenerator, + DummyTransformerVisionInputGenerator, DummyVisionEmbeddingsGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator, DummyVisionInputGenerator, diff --git a/optimum/utils/constant.py b/optimum/utils/constant.py index 4497b5246d4..eb7a67e9ece 100644 --- a/optimum/utils/constant.py +++ b/optimum/utils/constant.py @@ -15,8 +15,10 @@ CONFIG_NAME = "config.json" DIFFUSION_MODEL_UNET_SUBFOLDER = "unet" -DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER = "text_encoder" +DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER = "vae_decoder" DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER = "vae_encoder" +DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER = "text_encoder" DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER = "text_encoder_2" +DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" ONNX_WEIGHTS_NAME = "model.onnx" diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index 35d1ffe9fc7..ff8b587e19f 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -15,6 +15,50 @@ from .import_utils import DummyObject, requires_backends +class ORTDiffusionPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForText2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForImage2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForInpainting(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + class ORTStableDiffusionPipeline(metaclass=DummyObject): _backends = ["diffusers"] @@ -70,6 +114,17 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) +class ORTStableDiffusionXLInpaintPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + class ORTLatentConsistencyModelPipeline(metaclass=DummyObject): _backends = ["diffusers"] @@ -81,7 +136,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTDiffusionPipeline(metaclass=DummyObject): +class ORTLatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): @@ -92,7 +147,7 @@ def from_pretrained(cls, *args, 
**kwargs): requires_backends(cls, ["diffusers"]) -class ORTPipelineForText2Image(metaclass=DummyObject): +class ORTStableDiffusion3Pipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): @@ -103,7 +158,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTPipelineForImage2Image(metaclass=DummyObject): +class ORTStableDiffusion3Img2ImgPipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): @@ -114,7 +169,18 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTPipelineForInpainting(metaclass=DummyObject): +class ORTStableDiffusion3InpaintPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTFluxPipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index dac14a38114..148072aa0b4 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -22,7 +22,7 @@ import numpy as np from transformers.utils import is_tf_available, is_torch_available -from ..utils import check_if_transformers_greater +from ..utils import check_if_diffusers_greater, check_if_transformers_greater from .normalized_config import ( NormalizedConfig, NormalizedEncoderDecoderConfig, @@ -36,7 +36,7 @@ import torch if is_tf_available(): - import tensorflow as tf + import tensorflow as tf # type: ignore def check_framework_is_available(func): @@ -871,8 +871,8 @@ def __init__( def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "timestep": - shape = [self.batch_size] - return self.random_int_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=int_dtype) + shape = [] # a scalar with no dimension (it can be int or float depending on the sd architecture) + return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) if input_name == "text_embeds": dim = self.text_encoder_projection_dim @@ -1411,3 +1411,76 @@ def generate( float_dtype: str = "fp32", ): return self.random_int_tensor(shape=(1,), min_value=20, max_value=22, framework=framework, dtype=int_dtype) + + +class DummyTransformerTimestepInputGenerator(DummyTimestepInputGenerator): + SUPPORTED_INPUT_NAMES = ("timestep",) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "timestep": + shape = [self.batch_size] # With transformer diffusers, timestep is a 1D tensor + return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyTransformerVisionInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("hidden_states",) + + +class DummyTransformerTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "encoder_hidden_states", + "pooled_projection", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "encoder_hidden_states": + return super().generate(input_name, framework, int_dtype, 
float_dtype)[0] + + elif input_name == "pooled_projections": + return self.random_float_tensor( + [self.batch_size, self.normalized_config.projection_size], framework=framework, dtype=float_dtype + ) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyFluxTransformerVisionInputGenerator(DummyTransformerVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "hidden_states", + "img_ids", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "hidden_states": + shape = [self.batch_size, (self.height // 2) * (self.width // 2), self.num_channels] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + elif input_name == "img_ids": + shape = ( + [(self.height // 2) * (self.width // 2), 3] + if check_if_diffusers_greater("0.31.0") + else [self.batch_size, (self.height // 2) * (self.width // 2), 3] + ) + return self.random_int_tensor(shape, max_value=1, framework=framework, dtype=int_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyFluxTransformerTextInputGenerator(DummyTransformerTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "encoder_hidden_states", + "pooled_projections", + "txt_ids", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "txt_ids": + shape = ( + [self.sequence_length, 3] + if check_if_diffusers_greater("0.31.0") + else [self.batch_size, self.sequence_length, 3] + ) + return self.random_int_tensor(shape, max_value=1, framework=framework, dtype=int_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index ccccb5510bf..31059c403de 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -297,9 +297,11 @@ } PYTORCH_DIFFUSION_MODEL = { + "flux": "optimum-internal-testing/tiny-random-flux", + "latent-consistency": "echarlaix/tiny-random-latent-consistency", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", + "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", - "latent-consistency": "echarlaix/tiny-random-latent-consistency", } PYTORCH_TIMM_MODEL = { diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index 7671d6cd2e6..88288547c95 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -299,7 +299,6 @@ def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device with TemporaryDirectory() as tmpdirname: _, onnx_outputs = export_models( models_and_onnx_configs=models_and_onnx_configs, - opset=14, output_dir=Path(tmpdirname), device=device, ) @@ -307,7 +306,6 @@ def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device models_and_onnx_configs=models_and_onnx_configs, onnx_named_outputs=onnx_outputs, output_dir=Path(tmpdirname), - atol=1e-4, use_subprocess=False, ) diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 956566f0e1f..07f90e8984e 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -34,6 +34,7 @@ ORTPipelineForInpainting, ORTPipelineForText2Image, ) +from optimum.utils import check_if_transformers_greater from 
optimum.utils.testing_utils import grid_parameters, require_diffusers @@ -71,7 +72,29 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= class ORTPipelineForText2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES += ["stable-diffusion-3", "flux"] + + NEGATIVE_PROMPT_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + NEGATIVE_PROMPT_SUPPORTED_ARCHITECTURES += ["stable-diffusion-3"] + + CALLBACK_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + CALLBACK_SUPPORTED_ARCHITECTURES += ["flux"] ORTMODEL_CLASS = ORTPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -120,8 +143,8 @@ def test_num_images_per_prompt(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for batch_size in [1, 3]: - for height in [64, 128]: - for width in [64, 128]: + for height in [16, 32]: + for width in [16, 32]: for num_images_per_prompt in [1, 3]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images @@ -142,12 +165,12 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -164,6 +187,7 @@ def __init__(self): def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 + return kwargs ort_callback = Callback() auto_callback = Callback() @@ -171,9 +195,8 @@ def __call__(self, *args, **kwargs) -> None: ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - # callback_steps=1 to trigger callback every step - ort_pipe(**inputs, callback=ort_callback, callback_steps=1) - auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + ort_pipe(**inputs, callback_on_step_end=ort_callback) + auto_pipe(**inputs, callback_on_step_end=auto_callback) self.assertTrue(ort_callback.has_been_called) self.assertTrue(auto_callback.has_been_called) @@ -200,10 +223,20 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - self.assertEqual( - outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), - ) 
+ expected_height = height // pipeline.vae_scale_factor + expected_width = width // pipeline.vae_scale_factor + + if model_arch == "flux": + channels = pipeline.transformer.config.in_channels + expected_shape = (batch_size, expected_height * expected_width, channels) + elif model_arch == "stable-diffusion-3": + out_channels = pipeline.transformer.config.out_channels + expected_shape = (batch_size, out_channels, expected_height, expected_width) + else: + out_channels = pipeline.unet.config.out_channels + expected_shape = (batch_size, out_channels, expected_height, expected_width) + + self.assertEqual(outputs.shape, expected_shape) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -224,45 +257,22 @@ def test_image_reproducibility(self, model_arch: str): self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(NEGATIVE_PROMPT_SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + inputs["negative_prompt"] = ["This is a negative prompt"] * batch_size - negative_prompt = ["This is a negative prompt"] - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - images_1 = pipeline(**inputs, negative_prompt=negative_prompt, generator=get_generator("pt", SEED)).images - prompt = inputs.pop("prompt") - - if model_arch == "stable-diffusion-xl": - ( - inputs["prompt_embeds"], - inputs["negative_prompt_embeds"], - inputs["pooled_prompt_embeds"], - inputs["negative_pooled_prompt_embeds"], - ) = pipeline.encode_prompt( - prompt=prompt, - num_images_per_prompt=1, - device=torch.device("cpu"), - do_classifier_free_guidance=True, - negative_prompt=negative_prompt, - ) - else: - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( - prompt=prompt, - num_images_per_prompt=1, - device=torch.device("cpu"), - do_classifier_free_guidance=True, - negative_prompt=negative_prompt, - ) - - images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images - - np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2) + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand( grid_parameters( @@ -285,9 +295,9 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device.type, "cuda") outputs = pipeline(**inputs).images - self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) @@ -326,7 +336,19 @@ def test_safety_checker(self, model_arch: str): class ORTPipelineForImage2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = 
["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES += ["stable-diffusion-3"] + + CALLBACK_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] AUTOMODEL_CLASS = AutoPipelineForImage2Image ORTMODEL_CLASS = ORTPipelineForImage2Image @@ -373,14 +395,14 @@ def test_num_images_per_prompt(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for batch_size in [1, 3]: - for height in [64, 128]: - for width in [64, 128]: + for height in [16, 32]: + for width in [16, 32]: for num_images_per_prompt in [1, 3]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -398,15 +420,16 @@ def __init__(self): def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 + return kwargs ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_callback = Callback() auto_callback = Callback() - # callback_steps=1 to trigger callback every step - ort_pipe(**inputs, callback=ort_callback, callback_steps=1) - auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + ort_pipe(**inputs, callback_on_step_end=ort_callback) + auto_pipe(**inputs, callback_on_step_end=auto_callback) self.assertTrue(ort_callback.has_been_called) self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) @@ -434,9 +457,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -454,10 +487,10 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -541,7 +574,17 @@ def test_safety_checker(self, model_arch: str): class ORTPipelineForInpaintingTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = 
["stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + ] + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES += ["stable-diffusion-3"] + + CALLBACK_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + ] AUTOMODEL_CLASS = AutoPipelineForInpainting ORTMODEL_CLASS = ORTPipelineForInpainting @@ -593,14 +636,14 @@ def test_num_images_per_prompt(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for batch_size in [1, 3]: - for height in [64, 128]: - for width in [64, 128]: + for height in [16, 32]: + for width in [16, 32]: for num_images_per_prompt in [1, 3]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -618,15 +661,16 @@ def __init__(self): def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 + return kwargs ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_callback = Callback() auto_callback = Callback() - # callback_steps=1 to trigger callback every step - ort_pipe(**inputs, callback=ort_callback, callback_steps=1) - auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + ort_pipe(**inputs, callback_on_step_end=ort_callback) + auto_pipe(**inputs, callback_on_step_end=auto_callback) self.assertTrue(ort_callback.has_been_called) self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) @@ -654,9 +698,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -674,10 +728,10 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -719,7 +773,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) pipeline = 
self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - self.assertEqual(pipeline.device, "cuda") + self.assertEqual(pipeline.device.type, "cuda") outputs = pipeline(**inputs).images self.assertIsInstance(outputs, np.ndarray) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 84ac27029f9..c4340dcd8b6 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -143,7 +143,7 @@ class ORTModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.TEST_MODEL_ID = "sshleifer/tiny-distilbert-base-cased-distilled-squad" - self.LOCAL_MODEL_PATH = "assets/onnx" + self.LOCAL_MODEL_PATH = "tests/assets/onnx" self.ONNX_MODEL_ID = "philschmid/distilbert-onnx" self.TINY_ONNX_MODEL_ID = "fxmarty/resnet-tiny-beans" self.FAIL_ONNX_MODEL_ID = "sshleifer/tiny-distilbert-base-cased-distilled-squad" diff --git a/tests/onnxruntime/test_quantization.py b/tests/onnxruntime/test_quantization.py index 34a9504f95a..cf451590fbd 100644 --- a/tests/onnxruntime/test_quantization.py +++ b/tests/onnxruntime/test_quantization.py @@ -42,10 +42,10 @@ class ORTQuantizerTest(unittest.TestCase): LOAD_CONFIGURATION = { "local_asset": { - "model_or_path": "assets/onnx", + "model_or_path": "tests/assets/onnx", }, "local_asset_different_name": { - "model_or_path": "assets/onnx", + "model_or_path": "tests/assets/onnx", "file_name": "different_name.onnx", }, "ort_model_class": { diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 9f200e69b3d..ba8f6cc4abc 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -98,6 +98,7 @@ }, "falcon": "fxmarty/really-tiny-falcon-testing", "flaubert": "hf-internal-testing/tiny-random-flaubert", + "flux": "optimum-internal-testing/tiny-random-flux", "gemma": "fxmarty/tiny-random-GemmaForCausalLM", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", @@ -108,10 +109,10 @@ "groupvit": "hf-internal-testing/tiny-random-groupvit", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", - "levit": "hf-internal-testing/tiny-random-LevitModel", "latent-consistency": "echarlaix/tiny-random-latent-consistency", "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", + "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-LongT5Model", "llama": "optimum-internal-testing/tiny-random-llama", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", @@ -143,6 +144,7 @@ "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "speech_to_text": "hf-internal-testing/tiny-random-Speech2TextModel", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", + "stable-diffusion-3": "optimum-internal-testing/tiny-random-stable-diffusion-3", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", From d2a5a6aa2adbe9561527a85c4a4947a6d7fcfa58 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 21 Nov 2024 15:03:14 +0100 Subject: [PATCH 39/50] Remove datasets as required dependency (#2087) * remove datasets required dependency * 
install datasets when needed * add datasets installed when needed * style * add require dataset * divide datasets tests * import datasets only when needed --- .github/workflows/dev_test_benckmark.yml | 2 +- .github/workflows/test_benckmark.yml | 2 +- .github/workflows/test_utils.yml | 11 ++++++++++- optimum/gptq/data.py | 16 ++++++++++++++- optimum/gptq/quantizer.py | 2 +- optimum/onnxruntime/configuration.py | 15 +++++++++----- optimum/onnxruntime/model.py | 9 ++++++--- optimum/onnxruntime/quantization.py | 17 ++++++++++------ optimum/onnxruntime/runs/calibrator.py | 10 ++++++---- optimum/runs_base.py | 8 +++++--- optimum/utils/__init__.py | 1 + optimum/utils/import_utils.py | 12 ++++++++++++ optimum/utils/preprocessing/base.py | 19 +++++++++++++----- optimum/utils/testing_utils.py | 5 +++++ pyproject.toml | 1 + setup.py | 3 --- tests/utils/test_task_processors.py | 25 +++++++++++++++++++++++- 17 files changed, 123 insertions(+), 35 deletions(-) diff --git a/.github/workflows/dev_test_benckmark.yml b/.github/workflows/dev_test_benckmark.yml index a898d288625..381197b129a 100644 --- a/.github/workflows/dev_test_benckmark.yml +++ b/.github/workflows/dev_test_benckmark.yml @@ -23,7 +23,7 @@ jobs: - name: Install dependencies run: | pip install wheel - pip install .[tests,onnxruntime,benchmark] + pip install .[tests,onnxruntime,benchmark] datasets pip install -U git+https://github.com/huggingface/evaluate pip install -U git+https://github.com/huggingface/diffusers pip install -U git+https://github.com/huggingface/transformers diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml index e859e845d64..fe7df1a20cc 100644 --- a/.github/workflows/test_benckmark.yml +++ b/.github/workflows/test_benckmark.yml @@ -30,7 +30,7 @@ jobs: - name: Install dependencies run: | pip install wheel - pip install .[tests,onnxruntime,benchmark] + pip install .[tests,onnxruntime,benchmark] datasets - name: Test with unittest run: | python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml index 0126b023c60..bbe00e62841 100644 --- a/.github/workflows/test_utils.yml +++ b/.github/workflows/test_utils.yml @@ -37,4 +37,13 @@ jobs: - name: Test with pytest working-directory: tests run: | - python -m pytest -s -vvvv utils + pytest utils -s -n auto -m "not datasets_test" --durations=0 + + - name: Install datasets + run: | + pip install datasets + + - name: Tests needing datasets + working-directory: tests + run: | + pytest utils -s -n auto -m "datasets_test" --durations=0 \ No newline at end of file diff --git a/optimum/gptq/data.py b/optimum/gptq/data.py index b8734da478e..7e5fc0b43db 100644 --- a/optimum/gptq/data.py +++ b/optimum/gptq/data.py @@ -18,7 +18,12 @@ import numpy as np import torch -from datasets import load_dataset + +from optimum.utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available + + +if is_datasets_available(): + from datasets import load_dataset """ @@ -113,6 +118,9 @@ def pad_block(block, pads): def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_wikitext2")) + if split == "train": data = load_dataset("wikitext", "wikitext-2-raw-v1", split="train") elif split == "validation": @@ -132,6 +140,9 @@ def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "trai def get_c4(tokenizer: Any, seqlen: int, nsamples: 
int, split: str = "train"): + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4")) + if split == "train": data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}) elif split == "validation": @@ -157,6 +168,9 @@ def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): def get_c4_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4_new")) + if split == "train": data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}) elif split == "validation": diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 949d4d260df..849d8821ebf 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -88,7 +88,7 @@ def __init__( dataset (`Union[List[str], str, Any]`, defaults to `None`): The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...]) - or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new']. + or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new']. group_size (int, defaults to 128): The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. damp_percent (`float`, defaults to `0.1`): diff --git a/optimum/onnxruntime/configuration.py b/optimum/onnxruntime/configuration.py index 2e3d9f32d6a..adc1984795a 100644 --- a/optimum/onnxruntime/configuration.py +++ b/optimum/onnxruntime/configuration.py @@ -18,9 +18,8 @@ from dataclasses import asdict, dataclass, field from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from datasets import Dataset from packaging.version import Version, parse from onnxruntime import __version__ as ort_version @@ -33,6 +32,10 @@ from ..utils import logging +if TYPE_CHECKING: + from datasets import Dataset + + logger = logging.get_logger(__name__) # This value is used to indicate ORT which axis it should use to quantize an operator "per-channel" @@ -117,7 +120,9 @@ def create_calibrator( class AutoCalibrationConfig: @staticmethod - def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: float = 0.01) -> CalibrationConfig: + def minmax( + dataset: "Dataset", moving_average: bool = False, averaging_constant: float = 0.01 + ) -> CalibrationConfig: """ Args: dataset (`Dataset`): @@ -151,7 +156,7 @@ def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: f @staticmethod def entropy( - dataset: Dataset, + dataset: "Dataset", num_bins: int = 128, num_quantized_bins: int = 128, ) -> CalibrationConfig: @@ -188,7 +193,7 @@ def entropy( ) @staticmethod - def percentiles(dataset: Dataset, num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig: + def percentiles(dataset: "Dataset", num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig: """ Args: dataset (`Dataset`): diff --git a/optimum/onnxruntime/model.py b/optimum/onnxruntime/model.py index caa662f3824..4182abc925f 100644 --- a/optimum/onnxruntime/model.py +++ b/optimum/onnxruntime/model.py @@ -14,10 +14,9 @@ import logging import os -from typing import Callable, Dict, List, Optional, 
Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union import numpy as np -from datasets import Dataset from transformers import EvalPrediction from transformers.trainer_pt_utils import nested_concat from transformers.trainer_utils import EvalLoopOutput @@ -25,6 +24,10 @@ from onnxruntime import InferenceSession +if TYPE_CHECKING: + from datasets import Dataset + + logger = logging.getLogger(__name__) @@ -59,7 +62,7 @@ def __init__( self.session = InferenceSession(str(model_path), providers=[execution_provider]) self.onnx_input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} - def evaluation_loop(self, dataset: Dataset): + def evaluation_loop(self, dataset: "Dataset"): """ Run evaluation and returns metrics and predictions. diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py index f637916dcd2..054a2310a6b 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -21,7 +21,6 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import onnx -from datasets import Dataset, load_dataset from packaging.version import Version, parse from transformers import AutoConfig @@ -29,6 +28,7 @@ from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantizationMode, QuantType from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer from onnxruntime.quantization.qdq_quantizer import QDQQuantizer +from optimum.utils.import_utils import requires_backends from ..quantization_base import OptimumQuantizer from ..utils.save_utils import maybe_save_preprocessors @@ -40,6 +40,7 @@ if TYPE_CHECKING: + from datasets import Dataset from transformers import PretrainedConfig LOGGER = logging.getLogger(__name__) @@ -48,7 +49,7 @@ class ORTCalibrationDataReader(CalibrationDataReader): __slots__ = ["batch_size", "dataset", "_dataset_iter"] - def __init__(self, dataset: Dataset, batch_size: int = 1): + def __init__(self, dataset: "Dataset", batch_size: int = 1): if dataset is None: raise ValueError("Provided dataset is None.") @@ -158,7 +159,7 @@ def from_pretrained( def fit( self, - dataset: Dataset, + dataset: "Dataset", calibration_config: CalibrationConfig, onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx", operators_to_quantize: Optional[List[str]] = None, @@ -212,7 +213,7 @@ def fit( def partial_fit( self, - dataset: Dataset, + dataset: "Dataset", calibration_config: CalibrationConfig, onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx", operators_to_quantize: Optional[List[str]] = None, @@ -428,7 +429,7 @@ def get_calibration_dataset( seed: int = 2016, use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, - ) -> Dataset: + ) -> "Dataset": """ Creates the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. @@ -474,6 +475,10 @@ def get_calibration_dataset( "provided." 
) + requires_backends(self, ["datasets"]) + + from datasets import load_dataset + calib_dataset = load_dataset( dataset_name, name=dataset_config_name, @@ -492,7 +497,7 @@ def get_calibration_dataset( return self.clean_calibration_dataset(processed_calib_dataset) - def clean_calibration_dataset(self, dataset: Dataset) -> Dataset: + def clean_calibration_dataset(self, dataset: "Dataset") -> "Dataset": model = onnx.load(self.onnx_model_path) model_inputs = {input.name for input in model.graph.input} ignored_columns = list(set(dataset.column_names) - model_inputs) diff --git a/optimum/onnxruntime/runs/calibrator.py b/optimum/onnxruntime/runs/calibrator.py index c493a943747..bfdcd64d92e 100644 --- a/optimum/onnxruntime/runs/calibrator.py +++ b/optimum/onnxruntime/runs/calibrator.py @@ -1,6 +1,4 @@ -from typing import Dict, List - -from datasets import Dataset +from typing import TYPE_CHECKING, Dict, List from ...runs_base import Calibrator from .. import ORTQuantizer @@ -9,10 +7,14 @@ from ..preprocessors.passes import ExcludeGeLUNodes, ExcludeLayerNormNodes, ExcludeNodeAfter, ExcludeNodeFollowedBy +if TYPE_CHECKING: + from datasets import Dataset + + class OnnxRuntimeCalibrator(Calibrator): def __init__( self, - calibration_dataset: Dataset, + calibration_dataset: "Dataset", quantizer: ORTQuantizer, model_path: str, qconfig: QuantizationConfig, diff --git a/optimum/runs_base.py b/optimum/runs_base.py index 3a1d164c602..dadd445818f 100644 --- a/optimum/runs_base.py +++ b/optimum/runs_base.py @@ -2,13 +2,12 @@ import subprocess from contextlib import contextmanager from time import perf_counter_ns -from typing import Set +from typing import TYPE_CHECKING, Set import numpy as np import optuna import torch import transformers -from datasets import Dataset from tqdm import trange from . 
import version as optimum_version @@ -21,6 +20,9 @@ from .utils.runs import RunConfig, cpu_info_command +if TYPE_CHECKING: + from datasets import Dataset + os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -34,7 +36,7 @@ def get_autoclass_name(task): class Calibrator: def __init__( - self, calibration_dataset: Dataset, quantizer, model_path, qconfig, calibration_params, node_exclusion + self, calibration_dataset: "Dataset", quantizer, model_path, qconfig, calibration_params, node_exclusion ): self.calibration_dataset = calibration_dataset self.quantizer = quantizer diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 40d93d298e4..fb1794af49c 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -35,6 +35,7 @@ check_if_transformers_greater, is_accelerate_available, is_auto_gptq_available, + is_datasets_available, is_diffusers_available, is_onnx_available, is_onnxruntime_available, diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 35a6294ab52..405e3815b33 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -69,6 +69,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _auto_gptq_available = _is_package_available("auto_gptq") _timm_available = _is_package_available("timm") _sentence_transformers_available = _is_package_available("sentence_transformers") +_datasets_available = _is_package_available("datasets") torch_version = None if is_torch_available(): @@ -131,6 +132,10 @@ def is_sentence_transformers_available(): return _sentence_transformers_available +def is_datasets_available(): + return _datasets_available + + def is_auto_gptq_available(): if _auto_gptq_available: version_autogptq = version.parse(importlib_metadata.version("auto_gptq")) @@ -230,6 +235,12 @@ def require_numpy_strictly_lower(package_version: str, message: str): -U transformers`. Please note that you may need to restart your runtime after installation. """ +DATASETS_IMPORT_ERROR = """ +{0} requires the datasets library but it was not found in your environment. You can install it with pip: +`pip install datasets`. Please note that you may need to restart your runtime after installation. +""" + + BACKENDS_MAPPING = OrderedDict( [ ("diffusers", (is_diffusers_available, DIFFUSERS_IMPORT_ERROR)), @@ -245,6 +256,7 @@ def require_numpy_strictly_lower(package_version: str, message: str): "transformers_434", (lambda: check_if_transformers_greater("4.34"), "{0} " + TRANSFORMERS_IMPORT_ERROR.format("4.34")), ), + ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ] ) diff --git a/optimum/utils/preprocessing/base.py b/optimum/utils/preprocessing/base.py index dc995ccc50b..7cfda13ba7d 100644 --- a/optimum/utils/preprocessing/base.py +++ b/optimum/utils/preprocessing/base.py @@ -20,15 +20,16 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union -from datasets import Dataset, DatasetDict -from datasets import load_dataset as datasets_load_dataset from transformers import PreTrainedTokenizerBase from transformers.image_processing_utils import BaseImageProcessor +from optimum.utils.import_utils import requires_backends + from .. 
import logging if TYPE_CHECKING: + from datasets import Dataset, DatasetDict from transformers import PretrainedConfig @@ -102,11 +103,14 @@ def create_dataset_processing_func( def prepare_dataset( self, - dataset: Union[DatasetDict, Dataset], + dataset: Union["DatasetDict", "Dataset"], data_keys: Dict[str, str], ref_keys: Optional[List[str]] = None, split: Optional[str] = None, - ) -> Union[DatasetDict, Dataset]: + ) -> Union["DatasetDict", "Dataset"]: + requires_backends(self, ["datasets"]) + from datasets import Dataset + if isinstance(dataset, Dataset) and split is not None: raise ValueError("A Dataset and a split name were provided, but splits are for DatasetDict.") elif split is not None: @@ -131,7 +135,12 @@ def load_dataset( num_samples: Optional[int] = None, shuffle: bool = False, **load_dataset_kwargs, - ) -> Union[DatasetDict, Dataset]: + ) -> Union["DatasetDict", "Dataset"]: + requires_backends(self, ["datasets"]) + + from datasets import Dataset, DatasetDict + from datasets import load_dataset as datasets_load_dataset + dataset = datasets_load_dataset(path, **load_dataset_kwargs) if isinstance(dataset, DatasetDict) and load_smallest_split: diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index 76fe9a05b13..88b1acdb780 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -28,6 +28,7 @@ from . import ( is_accelerate_available, is_auto_gptq_available, + is_datasets_available, is_diffusers_available, is_sentence_transformers_available, is_timm_available, @@ -146,6 +147,10 @@ def require_sentence_transformers(test_case): return unittest.skipUnless(is_sentence_transformers_available(), "test requires sentence-transformers")(test_case) +def require_datasets(test_case): + return unittest.skipUnless(is_datasets_available(), "test requires datasets")(test_case) + + def grid_parameters( parameters: Dict[str, Iterable[Any]], yield_dict: bool = False, diff --git a/pyproject.toml b/pyproject.toml index 99a0f1c85fa..17bcd90e066 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "rocm_ep_test", "tensorflow_test", "timm_test", + "datasets_test", "run_in_series", "run_slow", "accelerate_test", diff --git a/setup.py b/setup.py index 29f97b604e0..6736085943a 100644 --- a/setup.py +++ b/setup.py @@ -13,14 +13,11 @@ REQUIRED_PKGS = [ - "coloredlogs", - "sympy", "transformers>=4.29", "torch>=1.11", "packaging", "numpy", "huggingface_hub>=0.8.0", - "datasets", ] # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index 16567048073..1a9f352a79f 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -19,16 +19,21 @@ from typing import TYPE_CHECKING, Any, Dict, Tuple, Union from unittest import TestCase -from datasets import DatasetDict +import pytest from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer +from optimum.utils.import_utils import is_datasets_available from optimum.utils.preprocessing import TaskProcessorsManager +from optimum.utils.testing_utils import require_datasets if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedTokenizerBase from transformers.image_processing_utils import BaseImageProcessor +if is_datasets_available(): + from datasets import DatasetDict + TEXT_MODEL_NAME = "bert-base-uncased" CONFIG = AutoConfig.from_pretrained(TEXT_MODEL_NAME) @@ -122,6 +127,8 @@ def 
test_create_defaults_and_kwargs_from_preprocessor_kwargs_does_not_mutate_pre
         )
         self.assertDictEqual(preprocessor_kwargs, clone)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_unallowed_data_keys(self):
         task_processor = TaskProcessorsManager.get_task_processor_class_for_task(self.TASK_NAME)(
             self.CONFIG, self.PREPROCESSOR
@@ -188,15 +195,23 @@ def _test_load_dataset(
 
         return dataset
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset(self):
         return self._test_load_dataset(False, False, False)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_by_guessing_data_keys(self):
         return self._test_load_dataset(False, True, False)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_and_only_keep_necessary_columns(self):
         return self._test_load_dataset(False, False, True)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_default_dataset(self):
         return self._test_load_dataset(True, False, False)
 
@@ -207,6 +222,8 @@ class TextClassificationProcessorTest(TestCase, TaskProcessorTestBase):
     PREPROCESSOR = TOKENIZER
     WRONG_PREPROCESSOR = IMAGE_PROCESSOR
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_with_max_length(self):
         max_length = random.randint(4, 16)
         dataset = self._test_load_dataset(False, False, True, max_length=max_length)
@@ -223,6 +240,8 @@ class TokenClassificationProcessorTest(TestCase, TaskProcessorTestBase):
     PREPROCESSOR = TOKENIZER
     WRONG_PREPROCESSOR = IMAGE_PROCESSOR
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_with_max_length(self):
         max_length = random.randint(4, 16)
         dataset = self._test_load_dataset(False, False, True, max_length=max_length)
@@ -232,6 +251,8 @@ def test_load_dataset_with_max_length(self):
         input_ids = dataset[0]["input_ids"]
         self.assertEqual(len(input_ids), max_length)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_default_dataset(self):
         self.skipTest(
             "Skipping so as not to execute conll2003 remote code (test would require trust_remote_code=True)"
@@ -244,6 +265,8 @@ class QuestionAnsweringProcessorTest(TestCase, TaskProcessorTestBase):
     PREPROCESSOR = TOKENIZER
     WRONG_PREPROCESSOR = IMAGE_PROCESSOR
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_with_max_length(self):
         max_length = 384
         dataset = self._test_load_dataset(False, False, True, max_length=max_length)

From 65a8a94adaf136dd677d28cfc837c0acfe993031 Mon Sep 17 00:00:00 2001
From: Raghu Ramarao
Date: Mon, 25 Nov 2024 18:30:00 +0530
Subject: [PATCH 40/50] Add ONNX Support for Decision Transformer Model (#2038)

* Decision Transformer to ONNX V0.1
* Decision Transformer to ONNX V0.2
* Update optimum/exporters/onnx/model_configs.py
* Apply suggestions from code review
* Update optimum/exporters/onnx/base.py
* Update optimum/exporters/onnx/model_configs.py
* Update optimum/utils/input_generators.py
* Update optimum/exporters/onnx/model_configs.py
* Apply suggestions from code review
* Update optimum/exporters/tasks.py
* ONNXToDT: changes to order of OrderedDict elements
* make style changes
* test
* remove custom normalized config
* remove unnecessary dynamic axes

---------

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Co-authored-by: IlyasMoutawwakil
---
 docs/source/exporters/onnx/overview.mdx | 1 +
 optimum/exporters/onnx/model_configs.py | 25 +++++++++++++++++
 optimum/exporters/tasks.py | 9 ++++++
 optimum/utils/__init__.py | 1 +
 optimum/utils/input_generators.py | 37 +++++++++++++++++++++++++
tests/exporters/exporters_utils.py | 1 + 6 files changed, 74 insertions(+) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 747e1396fb4..2eaada7dadd 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -36,6 +36,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Data2VecVision - Deberta - Deberta-v2 +- Decision Transformer - Deit - Detr - DistilBert diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 8984162ee8c..bca7cf24acf 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -27,6 +27,7 @@ BloomDummyPastKeyValuesGenerator, DummyAudioInputGenerator, DummyCodegenDecoderTextInputGenerator, + DummyDecisionTransformerInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, DummyFluxTransformerTextInputGenerator, @@ -263,6 +264,30 @@ class ImageGPTOnnxConfig(GPT2OnnxConfig): pass +class DecisionTransformerOnnxConfig(OnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (DummyDecisionTransformerInputGenerator,) + NORMALIZED_CONFIG_CLASS = NormalizedConfig + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "states": {0: "batch_size", 1: "sequence_length"}, + "actions": {0: "batch_size", 1: "sequence_length"}, + "timesteps": {0: "batch_size", 1: "sequence_length"}, + "returns_to_go": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "state_preds": {0: "batch_size", 1: "sequence_length"}, + "action_preds": {0: "batch_size", 1: "sequence_length"}, + "return_preds": {0: "batch_size", 1: "sequence_length"}, + "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, + } + + class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_attention_heads="num_heads") diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index b4bce4696f3..8f28ec42ce9 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -217,6 +217,7 @@ class TasksManager: "multiple-choice": "AutoModelForMultipleChoice", "object-detection": "AutoModelForObjectDetection", "question-answering": "AutoModelForQuestionAnswering", + "reinforcement-learning": "AutoModel", "semantic-segmentation": "AutoModelForSemanticSegmentation", "text-to-audio": ("AutoModelForTextToSpectrogram", "AutoModelForTextToWaveform"), "text-generation": "AutoModelForCausalLM", @@ -574,6 +575,11 @@ class TasksManager: onnx="DebertaV2OnnxConfig", tflite="DebertaV2TFLiteConfig", ), + "decision-transformer": supported_tasks_mapping( + "feature-extraction", + "reinforcement-learning", + onnx="DecisionTransformerOnnxConfig", + ), "deit": supported_tasks_mapping( "feature-extraction", "image-classification", @@ -2085,6 +2091,9 @@ def get_model_from_task( if original_task == "automatic-speech-recognition" or task == "automatic-speech-recognition": if original_task == "auto" and config.architectures is not None: model_class_name = config.architectures[0] + elif original_task == "reinforcement-learning" or task == "reinforcement-learning": + if config.architectures is not None: + model_class_name = config.architectures[0] if library_name == "diffusers": config = DiffusionPipeline.load_config(model_name_or_path, **kwargs) diff --git 
a/optimum/utils/__init__.py b/optimum/utils/__init__.py
index fb1794af49c..2aa90253d08 100644
--- a/optimum/utils/__init__.py
+++ b/optimum/utils/__init__.py
@@ -53,6 +53,7 @@
     DummyAudioInputGenerator,
     DummyBboxInputGenerator,
     DummyCodegenDecoderTextInputGenerator,
+    DummyDecisionTransformerInputGenerator,
     DummyDecoderTextInputGenerator,
     DummyEncodecInputGenerator,
     DummyFluxTransformerTextInputGenerator,
diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py
index 148072aa0b4..0ac1805f97d 100644
--- a/optimum/utils/input_generators.py
+++ b/optimum/utils/input_generators.py
@@ -507,6 +507,43 @@ class DummyDecoderTextInputGenerator(DummyTextInputGenerator):
     )
 
 
+class DummyDecisionTransformerInputGenerator(DummyTextInputGenerator):
+    """
+    Generates dummy decision transformer inputs.
+    """
+
+    SUPPORTED_INPUT_NAMES = (
+        "states",
+        "actions",
+        "timesteps",
+        "returns_to_go",
+        "attention_mask",
+    )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.act_dim = self.normalized_config.config.act_dim
+        self.state_dim = self.normalized_config.config.state_dim
+        self.max_ep_len = self.normalized_config.config.max_ep_len
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "states":
+            shape = [self.batch_size, self.sequence_length, self.state_dim]
+        elif input_name == "actions":
+            shape = [self.batch_size, self.sequence_length, self.act_dim]
+        elif input_name == "rewards":
+            shape = [self.batch_size, self.sequence_length, 1]
+        elif input_name == "returns_to_go":
+            shape = [self.batch_size, self.sequence_length, 1]
+        elif input_name == "attention_mask":
+            shape = [self.batch_size, self.sequence_length]
+        elif input_name == "timesteps":
+            shape = [self.batch_size, self.sequence_length]
+            return self.random_int_tensor(shape=shape, max_value=self.max_ep_len, framework=framework, dtype=int_dtype)
+
+        return self.random_float_tensor(shape, min_value=-2.0, max_value=2.0, framework=framework, dtype=float_dtype)
+
+
 class DummySeq2SeqDecoderTextInputGenerator(DummyDecoderTextInputGenerator):
     SUPPORTED_INPUT_NAMES = (
         "decoder_input_ids",
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 31059c403de..c56132c384c 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -67,6 +67,7 @@
     "data2vec-audio": "hf-internal-testing/tiny-random-Data2VecAudioModel",
     "deberta": "hf-internal-testing/tiny-random-DebertaModel",
     "deberta-v2": "hf-internal-testing/tiny-random-DebertaV2Model",
+    "decision-transformer": "edbeeching/decision-transformer-gym-hopper-medium",
     "deit": "hf-internal-testing/tiny-random-DeiTModel",
     "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder",
     "donut-swin": "hf-internal-testing/tiny-random-DonutSwinModel",

From a6c696c7de105e7691d432dd80102beec78d8fd4 Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Tue, 26 Nov 2024 20:52:43 +0100
Subject: [PATCH 41/50] Generate guidance for flux (#2104)

generate guidance

---
 optimum/onnxruntime/modeling_diffusion.py | 17 +++++++++++++++--
 optimum/utils/input_generators.py | 4 ++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 79d302be449..66b08e1ef66 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -437,8 +437,21 @@ def
to(self, device: Union[torch.device, str, int]): def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): return cls.load_config(config_name_or_path, **kwargs) - def _save_config(self, save_directory): - self.save_config(save_directory) + def _save_config(self, save_directory: Union[str, Path]): + model_dir = ( + self.model_save_dir + if not isinstance(self.model_save_dir, TemporaryDirectory) + else self.model_save_dir.name + ) + save_dir = Path(save_directory) + original_config = Path(model_dir) / self.config_name + if original_config.exists(): + if not save_dir.exists(): + save_dir.mkdir(parents=True) + + shutil.copy(original_config, save_dir) + else: + self.save_config(save_directory) @property def components(self) -> Dict[str, Any]: diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 0ac1805f97d..fbb77e6800a 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -1508,6 +1508,7 @@ class DummyFluxTransformerTextInputGenerator(DummyTransformerTextInputGenerator) SUPPORTED_INPUT_NAMES = ( "encoder_hidden_states", "pooled_projections", + "guidance", "txt_ids", ) @@ -1519,5 +1520,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int else [self.batch_size, self.sequence_length, 3] ) return self.random_int_tensor(shape, max_value=1, framework=framework, dtype=int_dtype) + elif input_name == "guidance": + shape = [self.batch_size] + return self.random_float_tensor(shape, min_value=0, max_value=1, framework=framework, dtype=float_dtype) return super().generate(input_name, framework, int_dtype, float_dtype) From bd08f12d2d4ebffdb2a25e32eabab759e4de88e5 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:13:11 +0100 Subject: [PATCH 42/50] Unbundle inputs generated by `DummyTimestepInputGenerator` (#2107) unbundle --- optimum/utils/input_generators.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index fbb77e6800a..18a2a5a3fd1 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -897,14 +897,14 @@ def __init__( ): self.task = task self.vocab_size = normalized_config.vocab_size - self.text_encoder_projection_dim = normalized_config.text_encoder_projection_dim - self.time_ids = 5 if normalized_config.requires_aesthetics_score else 6 + self.text_encoder_projection_dim = getattr(normalized_config, "text_encoder_projection_dim", None) + self.time_ids = 5 if getattr(normalized_config, "requires_aesthetics_score", False) else 6 if random_batch_size_range: low, high = random_batch_size_range self.batch_size = random.randint(low, high) else: self.batch_size = batch_size - self.time_cond_proj_dim = normalized_config.config.time_cond_proj_dim + self.time_cond_proj_dim = getattr(normalized_config.config, "time_cond_proj_dim", None) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "timestep": @@ -912,8 +912,16 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) if input_name == "text_embeds": + if self.text_encoder_projection_dim is None: + raise ValueError( + "Unable to infer the value of `text_encoder_projection_dim` for generating `text_embeds`, please double 
check the config of your model." + ) dim = self.text_encoder_projection_dim elif input_name == "timestep_cond": + if self.time_cond_proj_dim is None: + raise ValueError( + "Unable to infer the value of `time_cond_proj_dim` for generating `timestep_cond`, please double check the config of your model." + ) dim = self.time_cond_proj_dim else: dim = self.time_ids From 28bd0ad8fccfb6dd8019cd2882a88d69386a134c Mon Sep 17 00:00:00 2001 From: Brando Tovar <44623235+bndos@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:13:05 -0500 Subject: [PATCH 43/50] Pass the revision to SentenceTransformer models (#2105) feat: pass revision to SentenceTransformers --- optimum/exporters/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 8f28ec42ce9..c50fa5cdfa4 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -2128,6 +2128,7 @@ def get_model_from_task( device=device, cache_folder=cache_folder, token=token, + revision=revision, trust_remote_code=trust_remote_code, ) else: From f22655c036e4e61a7b09748e7aa7e146a16ae64d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Mlyn=C3=A1=C5=99?= <47664722+mlynatom@users.noreply.github.com> Date: Mon, 2 Dec 2024 14:54:08 +0100 Subject: [PATCH 44/50] Add RemBERT ONNX support (#2108) * ONNX config for RemBERT added * added RemBERT to TasksManager * rembert added to exporters_utils * RemBERT added to test modelling tasks * changed rembert model * added RemBERT to test utils * Added RemBERT to documentation * Apply suggestions from code review --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 4 ++++ optimum/exporters/tasks.py | 9 +++++++++ tests/exporters/exporters_utils.py | 3 ++- tests/onnxruntime/test_modeling.py | 5 +++++ tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 6 files changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 2eaada7dadd..57005b85678 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -83,6 +83,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - PoolFormer - Qwen2(Qwen1.5) - RegNet +- RemBERT - ResNet - Roberta - Roformer diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index bca7cf24acf..b39d19ec782 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -162,6 +162,10 @@ class SplinterOnnxConfig(BertOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class RemBertOnnxConfig(BertOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + + class DistilBertOnnxConfig(BertOnnxConfig): DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for transformers>=4.46.0 diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index c50fa5cdfa4..0a3758e97cf 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -431,6 +431,15 @@ class TasksManager: onnx="BertOnnxConfig", tflite="BertTFLiteConfig", ), + "rembert": supported_tasks_mapping( + "fill-mask", + "feature-extraction", + "text-classification", + "multiple-choice", + "token-classification", + "question-answering", + onnx="RemBertOnnxConfig", + ), # For big-bird and bigbird-pegasus being unsupported, refer to model_configs.py # "big-bird": supported_tasks_mapping( # 
"feature-extraction", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index c56132c384c..32156d9eebf 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -138,6 +138,7 @@ "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", # "rembert": "google/rembert", + "rembert": "hf-internal-testing/tiny-random-RemBertModel", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen2": "fxmarty/tiny-dummy-qwen2", "regnet": "hf-internal-testing/tiny-random-RegNetModel", @@ -257,7 +258,7 @@ "owlv2": "google/owlv2-base-patch16", "owlvit": "google/owlvit-base-patch32", "perceiver": "hf-internal-testing/tiny-random-PerceiverModel", # Not using deepmind/language-perceiver because it takes too much time for testing. - # "rembert": "google/rembert", + "rembert": "google/rembert", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "regnet": "facebook/regnet-y-040", "resnet": "microsoft/resnet-50", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index c4340dcd8b6..8f52ef45180 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -1312,6 +1312,7 @@ class ORTModelForQuestionAnsweringIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm_qa", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -1502,6 +1503,7 @@ class ORTModelForMaskedLMIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -1682,6 +1684,7 @@ class ORTModelForSequenceClassificationIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -1882,6 +1885,7 @@ class ORTModelForTokenClassificationIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -2227,6 +2231,7 @@ class ORTModelForMultipleChoiceIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index ba8f6cc4abc..cccecd53817 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -135,6 +135,7 @@ "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen2": "fxmarty/tiny-dummy-qwen2", + "rembert": "hf-internal-testing/tiny-random-RemBertModel", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-RobertaModel", "roformer": "hf-internal-testing/tiny-random-RoFormerModel", From 3ba10576e755f8e0740251c891082ee96e722afa Mon Sep 17 00:00:00 2001 From: "Tang, Wenyi" Date: Mon, 2 Dec 2024 22:55:04 +0800 Subject: [PATCH 45/50] Fix `ModelPatcher` returns empty outputs (#2109) * fix bug `ModelPatcher` returns empty outputs When model's output is tuple or list, `filtered_outputs` doesn't get assigned and hence always a empty dict * typo --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/onnx/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index fdfb0e280f5..2c0f9aeba67 100644 --- 
a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -168,7 +168,7 @@ def patched_forward(*args, **kwargs): filterd_outputs[name] = value elif isinstance(outputs, (list, tuple)): outputs_list = list(config.outputs.keys()) - dict(zip(outputs_list, outputs)) + filterd_outputs = dict(zip(outputs_list, outputs)) else: if len(config.outputs) > 1: num_outputs = len(config.outputs) From ff8c8fc95cb03b6ce72e0812bf0294bb2ae4463a Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 17:00:05 +0100 Subject: [PATCH 46/50] Fix workflow to mark issues as stale (#2110) * add permissions * update stale message --- .github/workflows/stale.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index a5e50a795b6..7b3eb5feb0c 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -6,9 +6,12 @@ on: jobs: stale: runs-on: ubuntu-latest + permissions: + issues: write steps: - - uses: actions/stale@v8 + - uses: actions/stale@v9 with: - stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.' + stale-issue-message: 'This issue has been marked as stale because it has been open for 30 days with no activity. This thread will be automatically closed in 5 days if no further activity occurs.' + exempt-issue-labels: 'bug,exporters,good first issue,onnx,onnxruntime,quantization' days-before-stale: 30 days-before-close: 5 From 01110adf076c94e395d1472a760eafac2c0a73aa Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 17:11:16 +0100 Subject: [PATCH 47/50] Remove doc-build (#2111) --- .github/workflows/build_main_documentation.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index c922f5097da..d38274f320a 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -18,12 +18,6 @@ jobs: repository: 'huggingface/doc-builder' path: doc-builder - - uses: actions/checkout@v2 - with: - repository: 'huggingface/doc-build' - path: doc-build - token: ${{ secrets.HUGGINGFACE_PUSH }} - - uses: actions/checkout@v2 with: repository: 'huggingface/optimum' From 7f2605ea94071f5495eac110ba240e2651ea8053 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 19:19:57 +0100 Subject: [PATCH 48/50] Downgrade stale bot to v8 and fix permissions (#2112) --- .github/workflows/stale.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 7b3eb5feb0c..28cf3ad9dc2 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -3,13 +3,14 @@ on: schedule: - cron: '30 1 * * *' +permissions: + issues: write + jobs: stale: runs-on: ubuntu-latest - permissions: - issues: write steps: - - uses: actions/stale@v9 + - uses: actions/stale@v8 with: stale-issue-message: 'This issue has been marked as stale because it has been open for 30 days with no activity. This thread will be automatically closed in 5 days if no further activity occurs.' 
exempt-issue-labels: 'bug,exporters,good first issue,onnx,onnxruntime,quantization' From d6de6762e0e4bf8136f0435211a0e777f5bf2f33 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 19:20:09 +0100 Subject: [PATCH 49/50] Update documentation color from google tpu section (#2113) * Update documentation color from google tpu section * fix --- docs/source/index.mdx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 06133664ca8..1b54570ea80 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -43,7 +43,7 @@ The packages below enable you to get the best of the 🤗 Hugging Face ecosystem
   [hunk body garbled in extraction — the surviving text shows the unchanged
   context card "Accelerate your training and inference workflows with AWS
   Trainium and AWS Inferentia", a removed and a re-added "Google TPUs" card
   heading (the one-line styling change this patch makes), and the unchanged
   context card "Accelerate your training and inference workflows with Google
   TPUs"; the surrounding HTML markup is unrecoverable]
-> [!TIP] -> Some packages provide hardware-agnostic features (e.g. INC interface in Optimum Intel). - - ## Open-source integrations 🤗 Optimum also supports a variety of open-source frameworks to make model optimization very easy. From 4a7cb298140ee9bed968d98a780a950d15bb2935 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:04:37 +0100 Subject: [PATCH 50/50] Fix workflow to mark PRs as stale (#2116) --- .github/workflows/stale.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 28cf3ad9dc2..6dc3ff2bbd9 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -5,6 +5,7 @@ on: permissions: issues: write + pull-requests: write jobs: stale: @@ -13,6 +14,10 @@ jobs: - uses: actions/stale@v8 with: stale-issue-message: 'This issue has been marked as stale because it has been open for 30 days with no activity. This thread will be automatically closed in 5 days if no further activity occurs.' + stale-pr-message: 'This PR has been marked as stale because it has been open for 90 days with no activity. This thread will be automatically closed in 30 days if no further activity occurs.' exempt-issue-labels: 'bug,exporters,good first issue,onnx,onnxruntime,quantization' - days-before-stale: 30 - days-before-close: 5 + days-before-issue-stale: 30 + days-before-issue-close: 5 + days-before-pr-stale: 90 + days-before-pr-close: 30 + exempt-all-pr-assignees: true \ No newline at end of file
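
A pattern worth noting from the `datasets`-decoupling diffs above (quantization, calibration, preprocessing): `datasets` is imported only under `TYPE_CHECKING` for annotations, availability is probed once without importing the package, and a descriptive error is raised at call time. Below is a minimal, self-contained sketch of that pattern; the helper name `shuffle_and_trim` and its body are illustrative assumptions, not code from this series.

```python
import importlib.util
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by static type checkers, so `datasets` stays an optional
    # runtime dependency of this module.
    from datasets import Dataset

# Probe availability once, without importing the package itself.
_datasets_available = importlib.util.find_spec("datasets") is not None


def shuffle_and_trim(dataset: "Dataset", num_samples: int) -> "Dataset":
    # Fail at call time rather than import time, mirroring `requires_backends`.
    if not _datasets_available:
        raise ImportError(
            "shuffle_and_trim requires the datasets library. You can install "
            "it with pip: `pip install datasets`."
        )
    return dataset.shuffle(seed=2016).select(range(num_samples))
```

The string annotation `"Dataset"` is what lets the signature survive when `datasets` is absent: it is never evaluated at runtime, which is exactly why the diffs above quote every `Dataset` annotation they touch.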
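The decision-transformer patch can likewise be smoke-tested end to end through the exporter API. The checkpoint and the `reinforcement-learning` task name come from the diffs above; the output directory and this exact invocation are assumed for illustration and are not part of the patch.

```python
from optimum.exporters.onnx import main_export

# Export the checkpoint registered in exporters_utils.py above;
# "reinforcement-learning" is the task added to TasksManager for this model.
main_export(
    "edbeeching/decision-transformer-gym-hopper-medium",
    output="decision_transformer_onnx",  # assumed output directory
    task="reinforcement-learning",
)
```

If the export succeeds, the resulting `model.onnx` should expose the `states`, `actions`, `timesteps`, `returns_to_go` and `attention_mask` inputs declared by `DecisionTransformerOnnxConfig`, each with `batch_size` and `sequence_length` as dynamic axes.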