From fcb169022962e8a33cd408eae566ab318696f5a7 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Tue, 16 Jul 2024 09:53:42 +0200
Subject: [PATCH 01/24] created auto task mappings

---
 optimum/onnxruntime/modeling_diffusion.py | 43 +++++++++++++++++------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index f4e54752115..3e5aed3fb01 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -18,6 +18,7 @@
 import shutil
 import warnings
 from abc import abstractmethod
+from collections import OrderedDict
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, Optional, Union
@@ -26,6 +27,7 @@
 import torch
 from diffusers import (
     DDIMScheduler,
+    DiffusionPipeline,
     LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionPipeline,
@@ -69,8 +71,8 @@
 logger = logging.getLogger(__name__)
 
 
-class ORTStableDiffusionPipelineBase(ORTModel):
-    auto_model_class = StableDiffusionPipeline
+class ORTDiffusionPipeline(ORTModel):
+    auto_model_class = DiffusionPipeline
     main_input_name = "input_ids"
     base_model_prefix = "onnx_model"
     config_name = "model_index.json"
@@ -350,9 +352,9 @@ def _from_pretrained(
             text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name,
             unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name,
             vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
-            text_encoder_2_path=new_model_save_dir
-            / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER
-            / text_encoder_2_file_name,
+            text_encoder_2_path=(
+                new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name
+            ),
             provider=provider,
             session_options=session_options,
             provider_options=provider_options,
@@ -561,7 +563,7 @@ def forward(self, sample: np.ndarray):
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusionPipelineMixin):
+class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline).
     """
@@ -570,7 +572,7 @@ class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusion
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDiffusionImg2ImgPipelineMixin):
+class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline).
     """
@@ -579,7 +581,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDi
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin):
+class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline).
     """
@@ -588,7 +590,7 @@ class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDi
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentConsistencyPipelineMixin):
+class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline).
     """
@@ -596,7 +598,7 @@ class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentCo
     __call__ = LatentConsistencyPipelineMixin.__call__
 
 
-class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase):
+class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline):
     auto_model_class = StableDiffusionXLImg2ImgPipeline
 
     def __init__(
@@ -661,3 +663,24 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
     """
 
     __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__
+
+
+AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", ORTStableDiffusionPipeline),
+        ("stable-diffusion-xl", ORTStableDiffusionXLPipeline),
+    ]
+)
+
+AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline),
+        ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline),
+    ]
+)
+
+AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", ORTStableDiffusionInpaintPipeline),
+    ]
+)

From 1cbb5448845036104648c6c20267a041a4568250 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Thu, 18 Jul 2024 16:50:32 +0200
Subject: [PATCH 02/24] added correct auto classes

---
 optimum/modeling_base.py                  |  9 ++++++---
 optimum/onnxruntime/modeling_diffusion.py | 24 +++++++++++++----------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py
index 5bab0622de4..3da2d9d0d21 100644
--- a/optimum/modeling_base.py
+++ b/optimum/modeling_base.py
@@ -85,7 +85,6 @@ class PreTrainedModel(ABC):  # noqa: F811
 
 class OptimizedModel(PreTrainedModel):
     config_class = AutoConfig
-    load_tf_weights = None
     base_model_prefix = "optimized_model"
     config_name = CONFIG_NAME
 
@@ -378,10 +377,14 @@ def from_pretrained(
                 )
             model_id, revision = model_id.split("@")
 
-        library_name = TasksManager.infer_library_from_model(model_id, subfolder, revision, cache_dir, token=token)
+        library_name = TasksManager.infer_library_from_model(
+            model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token
+        )
 
         if library_name == "timm":
-            config = PretrainedConfig.from_pretrained(model_id, subfolder, revision)
+            config = PretrainedConfig.from_pretrained(
+                model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token
+            )
 
         if config is None:
             if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME:
diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 3e5aed3fb01..59732e63eae 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -28,10 +28,14 @@
 from diffusers import (
     DDIMScheduler,
     DiffusionPipeline,
+    LatentConsistencyModelPipeline,
     LMSDiscreteScheduler,
     PNDMScheduler,
+    StableDiffusionImg2ImgPipeline,
+    StableDiffusionInpaintPipeline,
     StableDiffusionPipeline,
     StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLPipeline,
 )
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
@@ -73,11 +77,13 @@
 
 class ORTDiffusionPipeline(ORTModel):
     auto_model_class = DiffusionPipeline
-    main_input_name = "input_ids"
+    main_input_name = "prompt"
     base_model_prefix = "onnx_model"
     config_name = "model_index.json"
     sub_component_config_name = "config.json"
 
+    # TODO: instead of having a bloated init, we should probably have an init per pipeline,
+    # so that we can easily add new pipelines without having to modify the base class
     def __init__(
         self,
         vae_decoder_session: ort.InferenceSession,
@@ -401,7 +407,7 @@ def _from_transformers(
         provider_options: Optional[Dict[str, Any]] = None,
         use_io_binding: Optional[bool] = None,
         task: Optional[str] = None,
-    ) -> "ORTStableDiffusionPipeline":
+    ) -> "ORTDiffusionPipeline":
         if use_auth_token is not None:
             warnings.warn(
                 "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.",
@@ -568,7 +574,7 @@ class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMi
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline).
     """
 
-    __call__ = StableDiffusionPipelineMixin.__call__
+    auto_model_class = StableDiffusionPipeline
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
@@ -577,7 +583,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline).
     """
 
-    __call__ = StableDiffusionImg2ImgPipelineMixin.__call__
+    auto_model_class = StableDiffusionImg2ImgPipeline
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
@@ -586,7 +592,7 @@ class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInp
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline).
     """
 
-    __call__ = StableDiffusionInpaintPipelineMixin.__call__
+    auto_model_class = StableDiffusionInpaintPipeline
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
@@ -595,12 +601,10 @@ class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyP
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline).
     """
 
-    __call__ = LatentConsistencyPipelineMixin.__call__
+    auto_model_class = LatentConsistencyModelPipeline
 
 
 class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline):
-    auto_model_class = StableDiffusionXLImg2ImgPipeline
-
     def __init__(
         self,
         vae_decoder_session: ort.InferenceSession,
@@ -653,7 +657,7 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline).
     """
 
-    __call__ = StableDiffusionXLPipelineMixin.__call__
+    auto_model_class = StableDiffusionXLPipeline
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
@@ -662,7 +666,7 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline).
     """
 
-    __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__
+    auto_model_class = StableDiffusionXLImg2ImgPipeline
 
 
 AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(

From cdba70ea788938f2c632132606f64a95e476b761 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Tue, 16 Jul 2024 09:53:42 +0200
Subject: [PATCH 03/24] created auto task mappings

---
 optimum/onnxruntime/modeling_diffusion.py | 43 +++++++++++++++++------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index f4e54752115..3e5aed3fb01 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -18,6 +18,7 @@
 import shutil
 import warnings
 from abc import abstractmethod
+from collections import OrderedDict
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, Optional, Union
@@ -26,6 +27,7 @@
 import torch
 from diffusers import (
     DDIMScheduler,
+    DiffusionPipeline,
     LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionPipeline,
@@ -69,8 +71,8 @@
 logger = logging.getLogger(__name__)
 
 
-class ORTStableDiffusionPipelineBase(ORTModel):
-    auto_model_class = StableDiffusionPipeline
+class ORTDiffusionPipeline(ORTModel):
+    auto_model_class = DiffusionPipeline
     main_input_name = "input_ids"
     base_model_prefix = "onnx_model"
     config_name = "model_index.json"
@@ -350,9 +352,9 @@ def _from_pretrained(
             text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name,
             unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name,
             vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
-            text_encoder_2_path=new_model_save_dir
-            / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER
-            / text_encoder_2_file_name,
+            text_encoder_2_path=(
+                new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name
+            ),
             provider=provider,
             session_options=session_options,
             provider_options=provider_options,
@@ -561,7 +563,7 @@ def forward(self, sample: np.ndarray):
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusionPipelineMixin):
+class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline).
     """
@@ -570,7 +572,7 @@ class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusion
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDiffusionImg2ImgPipelineMixin):
+class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline).
     """
@@ -579,7 +581,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDi
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin):
+class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline).
     """
@@ -588,7 +590,7 @@ class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDi
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentConsistencyPipelineMixin):
+class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline).
     """
@@ -596,7 +598,7 @@ class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentCo
     __call__ = LatentConsistencyPipelineMixin.__call__
 
 
-class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase):
+class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline):
     auto_model_class = StableDiffusionXLImg2ImgPipeline
 
     def __init__(
@@ -661,3 +663,24 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
     """
 
     __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__
+
+
+AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", ORTStableDiffusionPipeline),
+        ("stable-diffusion-xl", ORTStableDiffusionXLPipeline),
+    ]
+)
+
+AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline),
+        ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline),
+    ]
+)
+
+AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", ORTStableDiffusionInpaintPipeline),
+    ]
+)

From 5bebbd52040c9d057bbbf36c6c9d78fec2f785a0 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Thu, 18 Jul 2024 16:50:32 +0200
Subject: [PATCH 04/24] added correct auto classes

---
 optimum/modeling_base.py                  |  9 ++++++---
 optimum/onnxruntime/modeling_diffusion.py | 24 +++++++++++++----------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py
index 5bab0622de4..3da2d9d0d21 100644
--- a/optimum/modeling_base.py
+++ b/optimum/modeling_base.py
@@ -85,7 +85,6 @@ class PreTrainedModel(ABC):  # noqa: F811
 
 class OptimizedModel(PreTrainedModel):
     config_class = AutoConfig
-    load_tf_weights = None
     base_model_prefix = "optimized_model"
     config_name = CONFIG_NAME
 
@@ -378,10 +377,14 @@ def from_pretrained(
                 )
             model_id, revision = model_id.split("@")
 
-        library_name = TasksManager.infer_library_from_model(model_id, subfolder, revision, cache_dir, token=token)
+        library_name = TasksManager.infer_library_from_model(
+            model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token
+        )
 
         if library_name == "timm":
-            config = PretrainedConfig.from_pretrained(model_id, subfolder, revision)
+            config = PretrainedConfig.from_pretrained(
+                model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token
+            )
 
         if config is None:
             if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME:
diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 3e5aed3fb01..59732e63eae 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -28,10 +28,14 @@
 from diffusers import (
     DDIMScheduler,
     DiffusionPipeline,
+    LatentConsistencyModelPipeline,
     LMSDiscreteScheduler,
     PNDMScheduler,
+    StableDiffusionImg2ImgPipeline,
+    StableDiffusionInpaintPipeline,
     StableDiffusionPipeline,
     StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLPipeline,
 )
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
@@ -73,11 +77,13 @@
 
 class ORTDiffusionPipeline(ORTModel):
     auto_model_class = DiffusionPipeline
-    main_input_name = "input_ids"
+    main_input_name = "prompt"
     base_model_prefix = "onnx_model"
     config_name = "model_index.json"
     sub_component_config_name = "config.json"
 
+    # TODO: instead of having a bloated init, we should probably have an init per pipeline,
+    # so that we can easily add new pipelines without having to modify the base class
     def __init__(
         self,
         vae_decoder_session: ort.InferenceSession,
@@ -401,7 +407,7 @@ def _from_transformers(
         provider_options: Optional[Dict[str, Any]] = None,
         use_io_binding: Optional[bool] = None,
         task: Optional[str] = None,
-    ) -> "ORTStableDiffusionPipeline":
+    ) -> "ORTDiffusionPipeline":
         if use_auth_token is not None:
             warnings.warn(
                 "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.",
@@ -568,7 +574,7 @@ class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMi
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline).
     """
 
-    __call__ = StableDiffusionPipelineMixin.__call__
+    auto_model_class = StableDiffusionPipeline
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
@@ -577,7 +583,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline).
     """
 
-    __call__ = StableDiffusionImg2ImgPipelineMixin.__call__
+    auto_model_class = StableDiffusionImg2ImgPipeline
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
@@ -586,7 +592,7 @@ class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInp
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline).
     """
 
-    __call__ = StableDiffusionInpaintPipelineMixin.__call__
+    auto_model_class = StableDiffusionInpaintPipeline
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
@@ -595,12 +601,10 @@ class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyP
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline).
     """
 
-    __call__ = LatentConsistencyPipelineMixin.__call__
+    auto_model_class = LatentConsistencyModelPipeline
 
 
 class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline):
-    auto_model_class = StableDiffusionXLImg2ImgPipeline
-
     def __init__(
         self,
         vae_decoder_session: ort.InferenceSession,
@@ -653,7 +657,7 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline).
     """
 
-    __call__ = StableDiffusionXLPipelineMixin.__call__
+    auto_model_class = StableDiffusionXLPipeline
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
@@ -662,7 +666,7 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline).
     """
 
-    __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__
+    auto_model_class = StableDiffusionXLImg2ImgPipeline
 
 
 AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(

From 40b2ac0ab619725aed28c3def0df3987857be6b5 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Fri, 19 Jul 2024 09:22:03 +0200
Subject: [PATCH 05/24] added ort/auto diffusion classes

---
 optimum/onnxruntime/modeling_diffusion.py | 104 +++++++++++++++++++++-
 1 file changed, 101 insertions(+), 3 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 59732e63eae..a5fcdc0ae56 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -26,6 +26,10 @@
 import numpy as np
 import torch
 from diffusers import (
+    AutoPipelineForImage2Image,
+    AutoPipelineForInpainting,
+    AutoPipelineForText2Image,
+    ConfigMixin,
     DDIMScheduler,
     DiffusionPipeline,
     LatentConsistencyModelPipeline,
@@ -37,10 +41,16 @@
     StableDiffusionXLImg2ImgPipeline,
     StableDiffusionXLPipeline,
 )
+from diffusers.pipelines.auto_pipeline import (
+    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
+    AUTO_INPAINT_PIPELINES_MAPPING,
+    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
+)
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
 from huggingface_hub import snapshot_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from huggingface_hub.utils import validate_hf_hub_args
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 from transformers.file_utils import add_end_docstrings
 
@@ -576,6 +586,8 @@ class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMi
 
     auto_model_class = StableDiffusionPipeline
 
+    __call__ = StableDiffusionPipelineMixin.__call__
+
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipelineMixin):
@@ -585,6 +597,8 @@ class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg
 
     auto_model_class = StableDiffusionImg2ImgPipeline
 
+    __call__ = StableDiffusionImg2ImgPipelineMixin.__call__
+
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipelineMixin):
@@ -594,6 +608,8 @@ class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInp
 
     auto_model_class = StableDiffusionInpaintPipeline
 
+    __call__ = StableDiffusionInpaintPipelineMixin.__call__
+
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyPipelineMixin):
@@ -603,6 +619,8 @@ class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyP
 
     auto_model_class = LatentConsistencyModelPipeline
 
+    __call__ = LatentConsistencyPipelineMixin.__call__
+
 
 class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline):
     def __init__(
@@ -659,6 +677,8 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu
 
     auto_model_class = StableDiffusionXLPipeline
 
+    __call__ = StableDiffusionXLPipelineMixin.__call__
+
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin):
@@ -668,23 +688,101 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
 
     auto_model_class = StableDiffusionXLImg2ImgPipeline
 
+    __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__
+
 
-AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
+ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
     [
+        ("lcm", ORTLatentConsistencyModelPipeline),
         ("stable-diffusion", ORTStableDiffusionPipeline),
         ("stable-diffusion-xl", ORTStableDiffusionXLPipeline),
     ]
 )
 
-AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
+ORT_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
     [
         ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline),
         ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline),
     ]
 )
 
-AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
+ORT_INPAINT_PIPELINES_MAPPING = OrderedDict(
     [
         ("stable-diffusion", ORTStableDiffusionInpaintPipeline),
     ]
 )
+
+
+def _get_task_class(ort_mapping, pipeline_class_name, throw_error_if_not_exist: bool = True):
+    for model_type, ort_pipeline_class in ort_mapping.items():
+        if pipeline_class_name == ort_pipeline_class.auto_model_class.__name__:
+            return ort_pipeline_class
+
+    if throw_error_if_not_exist:
+        raise ValueError(f"ORTPipeline can't find a pipeline linked to {pipeline_class_name}")
+
+
+class ORTPipelineBase(ConfigMixin):
+    config_name = "model_index.json"
+
+    ort_pipeline_mapping = None
+    auto_pipeline_mapping = None
+
+    def __init__(self, *args, **kwargs):
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
+            f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
+        )
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_or_path, **kwargs):
+        load_config_kwargs = {
+            "force_download": kwargs.get("force_download", False),
+            "resume_download": kwargs.get("resume_download", None),
+            "local_files_only": kwargs.get("local_files_only", False),
+            "cache_dir": kwargs.get("cache_dir", None),
+            "revision": kwargs.get("revision", None),
+            "proxies": kwargs.get("proxies", None),
+            "token": kwargs.get("token", None),
+        }
+
+        config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
+
+        original_class_name = config["_class_name"]
+
+        pipeline_cls = _get_task_class(
+            cls.ort_pipeline_mapping,
+            cls.auto_pipeline_mapping,
+            original_class_name,
+        )
+
+        return pipeline_cls.from_pretrained(pretrained_model_or_path, **kwargs)
+
+    @classmethod
+    def from_pipe(cls, **kwargs):
+        raise NotImplementedError(
+            f"from_pipe is not yet implemented for {cls.__name__}. Please use from_pretrained instead."
+        )
+
+
+class ORTPipelineForText2Image(ORTPipelineBase):
+    auto_model_class = AutoPipelineForText2Image
+
+    ort_pipeline_mapping = ORT_TEXT2IMAGE_PIPELINES_MAPPING
+    auto_pipeline_mapping = AUTO_TEXT2IMAGE_PIPELINES_MAPPING
+
+
+class ORTPipelineForImage2Image(ORTPipelineBase):
+    auto_model_class = AutoPipelineForImage2Image
+
+    ort_pipeline_mapping = ORT_IMAGE2IMAGE_PIPELINES_MAPPING
+    auto_pipeline_mapping = AUTO_IMAGE2IMAGE_PIPELINES_MAPPING
+
+
+class ORTPipelineForInpainting(ORTPipelineBase):
+    auto_model_class = AutoPipelineForInpainting
+
+    ort_pipeline_mapping = ORT_INPAINT_PIPELINES_MAPPING
+    auto_pipeline_mapping = AUTO_INPAINT_PIPELINES_MAPPING

From 29bfe57c01ff7c74503e094f01430168c3763b53 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Wed, 31 Jul 2024 15:07:53 +0200
Subject: [PATCH 06/24] fix ORTPipeline detection

---
 optimum/onnxruntime/__init__.py           |  6 ++++
 optimum/onnxruntime/modeling_diffusion.py | 42 +++++++++++++++--------
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py
index f1d4f63a9ff..35cbf14587e 100644
--- a/optimum/onnxruntime/__init__.py
+++ b/optimum/onnxruntime/__init__.py
@@ -88,6 +88,9 @@
         "ORTStableDiffusionXLPipeline",
         "ORTStableDiffusionXLImg2ImgPipeline",
         "ORTLatentConsistencyModelPipeline",
+        "ORTPipelineForText2Image",
+        "ORTPipelineForImage2Image",
+        "ORTPipelineForInpainting",
     ]
 
 
@@ -147,6 +150,9 @@
     else:
         from .modeling_diffusion import (
             ORTLatentConsistencyModelPipeline,
+            ORTPipelineForImage2Image,
+            ORTPipelineForInpainting,
+            ORTPipelineForText2Image,
             ORTStableDiffusionImg2ImgPipeline,
             ORTStableDiffusionInpaintPipeline,
             ORTStableDiffusionPipeline,
diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index a5fcdc0ae56..982dd123343 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -712,14 +712,32 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
     ]
 )
 
-
-def _get_task_class(ort_mapping, pipeline_class_name, throw_error_if_not_exist: bool = True):
-    for model_type, ort_pipeline_class in ort_mapping.items():
-        if pipeline_class_name == ort_pipeline_class.auto_model_class.__name__:
-            return ort_pipeline_class
+SUPPORTED_TASKS_MAPPINGS = [
+    ORT_TEXT2IMAGE_PIPELINES_MAPPING,
+    ORT_IMAGE2IMAGE_PIPELINES_MAPPING,
+    ORT_INPAINT_PIPELINES_MAPPING,
+]
+
+
+def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True):
+    def get_model(pipeline_class_name):
+        for task_mapping in SUPPORTED_TASKS_MAPPINGS:
+            for model_name, pipeline in task_mapping.items():
+                if (
+                    pipeline.__name__ == pipeline_class_name
+                    or pipeline.auto_model_class.__name__ == pipeline_class_name
+                ):
+                    return model_name
+
+    model_name = get_model(pipeline_class_name)
+
+    if model_name is not None:
+        task_class = mapping.get(model_name, None)
+        if task_class is not None:
+            return task_class
 
     if throw_error_if_not_exist:
-        raise ValueError(f"ORTPipeline can't find a pipeline linked to {pipeline_class_name}")
+        raise ValueError(f"AutoPipeline can't find a pipeline linked to {pipeline_class_name} for {model_name}")
 
 
 class ORTPipelineBase(ConfigMixin):
@@ -749,16 +767,12 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
         }
 
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
+        config = config[0] if isinstance(config, tuple) else config
+        class_name = config["_class_name"]
 
-        original_class_name = config["_class_name"]
-
-        pipeline_cls = _get_task_class(
-            cls.ort_pipeline_mapping,
-            cls.auto_pipeline_mapping,
-            original_class_name,
-        )
+        ort_pipeline_cls = _get_task_class(cls.ort_pipeline_mapping, class_name)
 
-        return pipeline_cls.from_pretrained(pretrained_model_or_path, **kwargs)
+        return ort_pipeline_cls.from_pretrained(pretrained_model_or_path, **kwargs)
 
     @classmethod
     def from_pipe(cls, **kwargs):

From f6df38ccca773e3de0cc55e567b0593fca4ece12 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Wed, 31 Jul 2024 15:08:13 +0200
Subject: [PATCH 07/24] start test refactoring

---
 optimum/utils/testing_utils.py                |  11 +
 tests/onnxruntime/test_diffusion.py           | 730 ++++++++++++++++++
 .../test_stable_diffusion_pipeline.py         | 562 --------------
 tests/onnxruntime/utils_onnxruntime_tests.py  |  15 +-
 4 files changed, 752 insertions(+), 566 deletions(-)
 create mode 100644 tests/onnxruntime/test_diffusion.py
 delete mode 100644 tests/onnxruntime/test_stable_diffusion_pipeline.py

diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py
index 76fe9a05b13..6579e230dc8 100644
--- a/optimum/utils/testing_utils.py
+++ b/optimum/utils/testing_utils.py
@@ -84,6 +84,17 @@ def require_ort_rocm(test_case):
     )
 
 
+def require_ort_cuda(test_case):
+    """Decorator marking a test that requires CUDAExecutionProvider for ONNX Runtime."""
+    import onnxruntime as ort
+
+    providers = ort.get_available_providers()
+
+    return unittest.skipUnless("CUDAExecutionProvider" == providers[0], "test requires CUDAExecutionProvider")(
+        test_case
+    )
+
+
 def require_hf_token(test_case):
     """
     Decorator marking a test that requires huggingface hub token.
diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py
new file mode 100644
index 00000000000..2d5ab7a7f8b
--- /dev/null
+++ b/tests/onnxruntime/test_diffusion.py
@@ -0,0 +1,730 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+import unittest
+from typing import Dict
+
+import numpy as np
+import PIL
+import pytest
+import torch
+from diffusers import (
+    AutoPipelineForImage2Image,
+    AutoPipelineForInpainting,
+    AutoPipelineForText2Image,
+    StableDiffusionPipeline,
+    StableDiffusionXLPipeline,
+)
+from diffusers.utils import load_image
+from diffusers.utils.testing_utils import floats_tensor
+from packaging.version import Version, parse
+from parameterized import parameterized
+from transformers.testing_utils import require_torch_gpu
+from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin
+
+from optimum.onnxruntime import (
+    ORTLatentConsistencyModelPipeline,
+    ORTPipelineForImage2Image,
+    ORTPipelineForInpainting,
+    ORTPipelineForText2Image,
+    ORTStableDiffusionImg2ImgPipeline,
+    ORTStableDiffusionInpaintPipeline,
+    ORTStableDiffusionPipeline,
+    ORTStableDiffusionXLImg2ImgPipeline,
+    ORTStableDiffusionXLPipeline,
+)
+from optimum.onnxruntime.modeling_diffusion import (
+    ORTModelTextEncoder,
+    ORTModelUnet,
+    ORTModelVaeDecoder,
+    ORTModelVaeEncoder,
+)
+from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor
+from optimum.utils.import_utils import _diffusers_version
+from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm
+
+
+if parse(_diffusers_version) > Version("0.21.4"):
+    from diffusers import LatentConsistencyModelPipeline
+
+
+def _generate_inputs(batch_size=1):
+    inputs = {
+        "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
+        "num_inference_steps": 3,
+        "guidance_scale": 7.5,
+        "output_type": "np",
+    }
+    return inputs
+
+
+def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"):
+    if input_type == "pil":
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        ).resize((width, height))
+    elif input_type == "np":
+        image = np.random.rand(height, width, channel)
+    elif input_type == "pt":
+        image = torch.rand((channel, height, width))
+
+    return [image] * batch_size
+
+
+def to_np(image):
+    if isinstance(image[0], PIL.Image.Image):
+        return np.stack([np.array(i) for i in image], axis=0)
+    elif isinstance(image, torch.Tensor):
+        return image.cpu().numpy().transpose(0, 2, 3, 1)
+    return image
+
+
+class ORTPipelineForText2ImageTest(ORTModelTestMixin):
+    ARCHITECTURE_TO_ORTMODEL_CLASS = {
+        "stable-diffusion": ORTStableDiffusionPipeline,
+        "stable-diffusion-xl": ORTStableDiffusionXLPipeline,
+        "lcm": ORTLatentConsistencyModelPipeline,
+    }
+
+    AUTOMODEL_CLASS = AutoPipelineForText2Image
+    ORTMODEL_CLASS = ORTPipelineForText2Image
+
+    TASK = "text-to-image"
+
+    @require_diffusers
+    def test_load_vanilla_model_which_is_not_supported(self):
+        with self.assertRaises(Exception) as context:
+            _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
+
+        self.assertIn(
+            f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
+        )
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_ort_pipeline_class_dispatch(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertIsInstance(pipeline, self.ARCHITECTURE_TO_ORTMODEL_CLASS[model_arch])
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        batch_size, height = 1, 32
+        for width in [64, 32]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+            for num_images in [1, 3]:
+                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters(
+            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]}
+        )
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outptus
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters(
+            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]}
+        )
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outptus
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
+            callback_fn.has_been_called = True
+            callback_fn.number_of_steps += 1
+
+        callback_fn.has_been_called = False
+        callback_fn.number_of_steps = 0
+
+        inputs = self.generate_inputs(height=64, width=64)
+        pipeline(**inputs, callback=callback_fn, callback_steps=1)
+        self.assertTrue(callback_fn.has_been_called)
+        self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"])
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        height, width, batch_size = 128, 64, 1
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        for output_type in ["np", "pil", "latent"]:
+            inputs["output_type"] = output_type
+            outputs = pipeline(**inputs).images
+            if output_type == "pil":
+                self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+            elif output_type == "np":
+                self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+            else:
+                self.assertEqual(
+                    outputs.shape,
+                    (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                )
+
+    @parameterized.expand(["stable-diffusion", "stable-diffusion-xl"])
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        inputs = _generate_inputs()
+        height, width = 64, 64
+        np.random.seed(0)
+        ort_outputs_1 = pipeline(**inputs, height=height, width=width)
+        np.random.seed(0)
+        ort_outputs_2 = pipeline(**inputs, height=height, width=width)
+        ort_outputs_3 = pipeline(**inputs, height=height, width=width)
+        self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
+        self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
+
+    @parameterized.expand(["stable-diffusion"])
+    def test_negative_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        negative_prompt = ["This is a negative prompt"]
+        np.random.seed(0)
+        image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1]
+        prompt = inputs.pop("prompt")
+        embeds = []
+        for p in [prompt, negative_prompt]:
+            text_inputs = pipeline.tokenizer(
+                p,
+                padding="max_length",
+                max_length=pipeline.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32))
+            embeds.append(pipeline.text_encoder(text_inputs)[0])
+
+        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
+        np.random.seed(0)
+        image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1]
+        self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4))
+
+    def generate_inputs(self, height=128, width=128, batch_size=1):
+        inputs = _generate_inputs(batch_size=batch_size)
+        inputs["height"] = height
+        inputs["width"] = width
+        return inputs
+
+
+class ORTPipelineForImage2ImageTest(ORTModelTestMixin):
+    ARCHITECTURE_TO_ORTMODEL_CLASS = {
+        "stable-diffusion": ORTStableDiffusionImg2ImgPipeline,
+        "stable-diffusion-xl": ORTStableDiffusionXLImg2ImgPipeline,
+    }
+    AUTOMODEL_CLASS = AutoPipelineForImage2Image
+    ORTMODEL_CLASS = ORTPipelineForImage2Image
+
+    TASK = "image-to-image"
+
+    @require_diffusers
+    def test_load_vanilla_model_which_is_not_supported(self):
+        with self.assertRaises(Exception) as context:
+            _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
+
+        self.assertIn(
+            f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
+        )
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        batch_size, height = 1, 32
+        for width in [64, 32]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+            for num_images in [1, 3]:
+                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters(
+            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]}
+        )
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outptus
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters(
+            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]}
+        )
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outptus
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
+            callback_fn.has_been_called = True
+            callback_fn.number_of_steps += 1
+
+        pipe = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
+        callback_fn.has_been_called = False
+        callback_fn.number_of_steps = 0
+        inputs = self.generate_inputs(height=64, width=64)
+        pipe(**inputs, callback=callback_fn, callback_steps=1)
+        self.assertTrue(callback_fn.has_been_called)
+        self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"])
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        height, width, batch_size = 128, 64, 1
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        for input_type in ["np", "pil", "pt"]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
+
+            for output_type in ["np", "pil", "latent"]:
+                inputs["output_type"] = output_type
+                outputs = pipeline(**inputs).images
+                if output_type == "pil":
+                    self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+                elif output_type == "np":
+                    self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+                else:
+                    self.assertEqual(
+                        outputs.shape,
+                        (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                    )
+
+    def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"):
+        inputs = _generate_inputs(batch_size=batch_size)
+        inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type)
+        inputs["strength"] = 0.75
+        return inputs
+
+    # @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    # @require_diffusers
+    # def test_shape(self, model_arch: str):
+    #     model_args = {"test_name": model_arch, "model_arch": model_arch}
+    #     self._setup(model_args)
+    #     height, width, batch_size = 128, 64, 1
+    #     pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+    #     if self.TASK == "image-to-image":
+    #         input_types = ["np", "pil", "pt"]
+    #     elif self.TASK == "text-to-image":
+    #         input_types = ["np"]
+    #     else:
+    #         input_types = ["pil"]
+
+    #     for input_type in input_types:
+    #         if self.TASK == "image-to-image":
+    #             inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
+    #         else:
+    #             inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+    #         for output_type in ["np", "pil", "latent"]:
+    #             inputs["output_type"] = output_type
+    #             outputs = pipeline(**inputs).images
+    #             if output_type == "pil":
+    #                 self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+    #             elif output_type == "np":
+    #                 self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+    #             else:
+    #                 self.assertEqual(
+    #                     outputs.shape,
+    #                     (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+    #                 )
+
+
+# class ORTStableDiffusionImg2ImgPipelineTest(ORTStableDiffusionPipelineBase):
+#     SUPPORTED_ARCHITECTURES = ["stable-diffusion"]
+#     ORTMODEL_CLASS = ORTStableDiffusionImg2ImgPipeline
+#     TASK = "image-to-image"
+
+#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+#     @require_diffusers
+#     def test_compare_diffusers_pipeline(self, model_arch: str):
+#         model_args = {"test_name": model_arch, "model_arch": model_arch}
+#         self._setup(model_args)
+#         height, width = 128, 128
+
+#         inputs = self.generate_inputs(height=height, width=width)
+#         inputs["prompt"] = "A painting of a squirrel eating a burger"
+#         inputs["image"] = floats_tensor((1, 3, height, width), rng=random.Random(SEED))
+
+#         ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+#         ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images
+
+#         diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+#         diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images
+
+#         self.assertTrue(np.allclose(ort_output, diffusers_onnx_output, atol=1e-1))
+
+
+# class ORTStableDiffusionPipelineTest(unittest.TestCase):
+#     SUPPORTED_ARCHITECTURES = [
+#         "stable-diffusion",
+#     ]
+#     ORTMODEL_CLASS = ORTStableDiffusionPipeline
+#     TASK = "text-to-image"
+
+
+#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+#     @require_diffusers
+#     def test_image_reproducibility(self, model_arch: str):
+#         pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
+#         inputs = _generate_inputs()
+#         height, width = 64, 32
+#         np.random.seed(0)
+#         ort_outputs_1 = pipeline(**inputs, height=height, width=width)
+#         np.random.seed(0)
+#         ort_outputs_2 = pipeline(**inputs, height=height, width=width)
+#         ort_outputs_3 = pipeline(**inputs, height=height, width=width)
+#         # Compare model outputs
+#         self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
+#         self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
+
+#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+#     def test_negative_prompt(self, model_arch: str):
+#         pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
+#         inputs = _generate_inputs()
+#         inputs["height"], inputs["width"] = 64, 32
+#         negative_prompt = ["This is a negative prompt"]
+#         np.random.seed(0)
+#         image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1]
+#         prompt = inputs.pop("prompt")
+#         embeds = []
+#         for p in [prompt, negative_prompt]:
+#             text_inputs = pipeline.tokenizer(
+#                 p,
+#                 padding="max_length",
+#                 max_length=pipeline.tokenizer.model_max_length,
+#                 truncation=True,
+#                 return_tensors="np",
+#             )
+#             text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32))
+#             embeds.append(pipeline.text_encoder(text_inputs)[0])
+
+#         inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
+#         np.random.seed(0)
+#         image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1]
+#         self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4))
+
+
+# class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin):
+#     SUPPORTED_ARCHITECTURES = [
+#         "stable-diffusion-xl",
+#     ]
+#     ORTMODEL_CLASS = ORTStableDiffusionXLPipeline
+#     TASK = "text-to-image"
+
+#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+#     @require_diffusers
+#     def test_compare_to_diffusers(self, model_arch: str):
+#         ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
+#         self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder)
+#         self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder)
+#         self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder)
+#         self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder)
+#         self.assertIsInstance(ort_pipeline.unet, ORTModelUnet)
+#         self.assertIsInstance(ort_pipeline.config, Dict)
+
+#         pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch])
+#         batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32
+#         latents = ort_pipeline.prepare_latents(
+#             batch_size * num_images_per_prompt,
+#             ort_pipeline.unet.config["in_channels"],
+#             height,
+#             width,
+#             dtype=np.float32,
+#             generator=np.random.RandomState(0),
+#         )
+
+#         kwargs = {
+#             "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
+#             "num_inference_steps": 1,
+#             "num_images_per_prompt": num_images_per_prompt,
+#             "height": height,
+#             "width": width,
+#             "guidance_rescale": 0.1,
+#         }
+
+#         for output_type in ["latent", "np"]:
+#             ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images
+#             self.assertIsInstance(ort_outputs, np.ndarray)
+#             with torch.no_grad():
+#                 outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
+
+#             # Compare model outputs
+#             self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4))
+#             # Compare model devices
+#             self.assertEqual(pipeline.device, ort_pipeline.device)
+
+#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+#     @require_diffusers
+#     def test_image_reproducibility(self, model_arch: str):
+#         pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
+#         inputs = _generate_inputs()
+#         height, width = 64, 32
+#         np.random.seed(0)
+#         ort_outputs_1 = pipeline(**inputs, height=height, width=width)
+#         np.random.seed(0)
+#         ort_outputs_2 = pipeline(**inputs, height=height, width=width)
+#         ort_outputs_3 = pipeline(**inputs, height=height, width=width)
+#         self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
+#         self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
+
+
+# class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase):
+#     SUPPORTED_ARCHITECTURES = [
+#         "stable-diffusion",
+#     ]
+#     ORTMODEL_CLASS = ORTStableDiffusionInpaintPipeline
+#     TASK = "inpainting"
+
+#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+#     @require_diffusers
+#     def test_compare_diffusers_pipeline(self, model_arch: str):
+#         model_args = {"test_name": model_arch, "model_arch": model_arch}
+#         self._setup(model_args)
+#         ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+#         diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch])
+#         height, width = 64, 64
+#         latents_shape = (
+#             1,
+#             ort_pipeline.vae_decoder.config["latent_channels"],
+#             height // ort_pipeline.vae_scale_factor,
+#             width // ort_pipeline.vae_scale_factor,
+#         )
+#         inputs = self.generate_inputs(height=height, width=width)
+
+#         np_latents = np.random.rand(*latents_shape).astype(np.float32)
+#         torch_latents = torch.from_numpy(np_latents)
+
+#         ort_outputs = ort_pipeline(**inputs, latents=np_latents).images
+#         self.assertEqual(ort_outputs.shape, (1, height, width, 3))
+
+#         diffusers_outputs = diffusers_pipeline(**inputs, latents=torch_latents).images
+#         self.assertEqual(diffusers_outputs.shape, (1, height, width, 3))
+
+#         self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4))
+
+#     def generate_inputs(self, height=128, width=128, batch_size=1):
+#         inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width)
+#         inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0]
+#         inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0]
+#         return inputs
+
+
+# class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin):
+#     SUPPORTED_ARCHITECTURES = [
+#         "stable-diffusion-xl",
+#     ]
+#     ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline
+#     TASK = "image-to-image"
+
+#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+#     @require_diffusers
+#     def test_inference(self, model_arch: str):
+#         model_args = {"test_name": model_arch, "model_arch": model_arch}
+#         self._setup(model_args)
+#         pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+#         height, width = 128, 128
+#         inputs = self.generate_inputs(height=height, width=width)
+#         inputs["image"] = load_image(
+#             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+#             "/in_paint/overture-creations-5sI6fQgYIuo.png"
+#         ).resize((width, height))
+#         output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1]
+#         expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080])
+
+#         self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1))
+
+#     def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"):
+#         inputs = _generate_inputs(batch_size=batch_size)
+#         inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type)
+#         inputs["strength"] = 0.75
+#         return inputs
+
+
+# class ORTLatentConsistencyModelPipelineTest(ORTModelTestMixin):
+#     SUPPORTED_ARCHITECTURES = [
+#         "latent-consistency",
+#     ]
+#     ORTMODEL_CLASS = ORTLatentConsistencyModelPipeline
+#     TASK = "text-to-image"
+
+#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+#     @require_diffusers
+#     @unittest.skipIf(
+#         parse(_diffusers_version) <= Version("0.21.4"),
+#         "not supported with this diffusers version, needs diffusers>=v0.22.0",
+#     )
+#     def test_compare_to_diffusers(self, model_arch: str):
+#         ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
+#         self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder)
+#         self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder)
+#         self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder)
+#         self.assertIsInstance(ort_pipeline.unet, ORTModelUnet)
+#         self.assertIsInstance(ort_pipeline.config, Dict)
+
+#         pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch])
+#         batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32
+#         latents = ort_pipeline.prepare_latents(
+#             batch_size * num_images_per_prompt,
+#             ort_pipeline.unet.config["in_channels"],
+#             height,
+#             width,
+#             dtype=np.float32,
+#             generator=np.random.RandomState(0),
+#         )
+
+#         kwargs = {
+#             "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
+#             "num_inference_steps": 1,
+#             "num_images_per_prompt": num_images_per_prompt,
+#             "height": height,
+#             "width": width,
+#             "guidance_scale": 8.5,
+#         }
+
+#         for output_type in ["latent", "np"]:
+#             ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images
+#             self.assertIsInstance(ort_outputs, np.ndarray)
+#             with torch.no_grad():
+#                 outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
+
+#             # Compare model outputs
+#             self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4))
+#             # Compare model devices
+#             self.assertEqual(pipeline.device, ort_pipeline.device)
+
+
+class ImageProcessorTest(unittest.TestCase):
+    def test_vae_image_processor_pt(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
+        input_pt = torch.stack(_create_image(height=8, width=8, batch_size=1, input_type="pt"))
+        input_np = to_np(input_pt)
+
+        for output_type in ["np", "pil"]:
+            out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type)
+            out_np = to_np(out)
+            in_np = (input_np * 255).round() if output_type == "pil" else input_np
+            self.assertTrue(np.allclose(in_np, out_np, atol=1e-6))
+
+    def test_vae_image_processor_np(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
+        input_np = np.stack(_create_image(height=8, width=8, input_type="np"))
+        for output_type in ["np", "pil"]:
+            out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type)
+            out_np = to_np(out)
+            in_np = (input_np * 255).round() if output_type == "pil" else input_np
+            self.assertTrue(np.allclose(in_np, out_np, atol=1e-6))
+
+    def test_vae_image_processor_pil(self):
+        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
+        input_pil = _create_image(height=8, width=8, batch_size=1, input_type="pil")
+
+        for output_type in ["np", "pil"]:
+            out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type)
+            for i, o in zip(input_pil, out):
+                in_np = np.array(i)
+                out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round()
+                self.assertTrue(np.allclose(in_np, out_np, atol=1e-6))
diff --git a/tests/onnxruntime/test_stable_diffusion_pipeline.py b/tests/onnxruntime/test_stable_diffusion_pipeline.py
deleted file mode 100644
index 44cd22ffecc..00000000000
--- a/tests/onnxruntime/test_stable_diffusion_pipeline.py
+++ /dev/null
@@ -1,562 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import random
-import unittest
-from typing import Dict
-
-import numpy as np
-import PIL
-import pytest
-import torch
-from diffusers import (
-    OnnxStableDiffusionImg2ImgPipeline,
-    StableDiffusionPipeline,
-    StableDiffusionXLPipeline,
-)
-from diffusers.utils import load_image
-from diffusers.utils.testing_utils import floats_tensor
-from packaging.version import Version, parse
-from parameterized import parameterized
-from transformers.testing_utils import require_torch_gpu
-from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin
-
-from optimum.onnxruntime import (
-    ORTLatentConsistencyModelPipeline,
-    ORTStableDiffusionImg2ImgPipeline,
-    ORTStableDiffusionInpaintPipeline,
-    ORTStableDiffusionPipeline,
-    ORTStableDiffusionXLImg2ImgPipeline,
-    ORTStableDiffusionXLPipeline,
-)
-from optimum.onnxruntime.modeling_diffusion import (
-    ORTModelTextEncoder,
-    ORTModelUnet,
-    ORTModelVaeDecoder,
-    ORTModelVaeEncoder,
-)
-from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor
-from optimum.utils.import_utils import _diffusers_version
-from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm
-
-
-if parse(_diffusers_version) > Version("0.21.4"):
-    from diffusers import LatentConsistencyModelPipeline
-
-
-def _generate_inputs(batch_size=1):
-    inputs = {
-        "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
-        "num_inference_steps": 3,
-        "guidance_scale": 7.5,
-        "output_type": "np",
-    }
-    return inputs
-
-
-def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"):
-    if input_type == "pil":
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        ).resize((width, height))
-    elif input_type == "np":
-        image = np.random.rand(height, width, channel)
-    elif input_type == "pt":
-        image = torch.rand((channel, height, width))
-
-    return [image] * batch_size
-
-
-def to_np(image):
-    if isinstance(image[0], PIL.Image.Image):
-        return np.stack([np.array(i) for i in image], axis=0)
-    elif isinstance(image, torch.Tensor):
-        return image.cpu().numpy().transpose(0, 2, 3, 1)
-    return image
-
-
-class ORTStableDiffusionPipelineBase(ORTModelTestMixin):
-    SUPPORTED_ARCHITECTURES = [
-        "stable-diffusion",
-    ]
-    ORTMODEL_CLASS = ORTStableDiffusionPipeline
-    TASK = "text-to-image"
-
-    @require_diffusers
-    def test_load_vanilla_model_which_is_not_supported(self):
-        with self.assertRaises(Exception) as context:
-            _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
-
-        self.assertIn(
-            f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
-        )
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_num_images_per_prompt(self, model_arch: str):
-        model_args = {"test_name": model_arch, "model_arch": model_arch}
-        self._setup(model_args)
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-        self.assertEqual(pipeline.vae_scale_factor, 2)
-        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
-        self.assertEqual(pipeline.unet.config["in_channels"], 4)
-
-        batch_size, height = 1, 32
-        for width in [64, 32]:
-            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-            for num_images in [1, 3]:
-                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
-                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
-
-    @parameterized.expand(
-        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
-    )
-    @require_torch_gpu
-    @pytest.mark.cuda_ep_test
-    @require_diffusers
-    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
-        model_args = {"test_name": test_name, "model_arch": model_arch}
-        self._setup(model_args)
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
-        height, width, batch_size = 32, 64, 1
-        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-        outputs = pipeline(**inputs).images
-        # Verify model devices
-        self.assertEqual(pipeline.device.type.lower(), "cuda")
-        # Verify model outptus
-        self.assertIsInstance(outputs, np.ndarray)
-        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
-
-    @parameterized.expand(
-        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
-    )
-    @require_torch_gpu
-    @require_ort_rocm
-    @pytest.mark.rocm_ep_test
-    @require_diffusers
-    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
-        model_args = {"test_name": test_name, "model_arch": model_arch}
-        self._setup(model_args)
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
-        height, width, batch_size = 32, 64, 1
-        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-        outputs = pipeline(**inputs).images
-        # Verify model devices
-        self.assertEqual(pipeline.device.type.lower(), "cuda")
-        # Verify model outptus
-        self.assertIsInstance(outputs, np.ndarray)
-        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_callback(self, model_arch: str):
-        def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
-            callback_fn.has_been_called = True
-            callback_fn.number_of_steps += 1
-
-        pipe = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        callback_fn.has_been_called = False
-        callback_fn.number_of_steps = 0
-        inputs = self.generate_inputs(height=64, width=64)
-        pipe(**inputs, callback=callback_fn, callback_steps=1)
-        self.assertTrue(callback_fn.has_been_called)
-        self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"])
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_shape(self, model_arch: str):
-        model_args = {"test_name": model_arch, "model_arch": model_arch}
-        self._setup(model_args)
-        height, width, batch_size = 128, 64, 1
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-
-        if self.TASK == "image-to-image":
-            input_types = ["np", "pil", "pt"]
-        elif self.TASK == "text-to-image":
-            input_types = ["np"]
-        else:
-            input_types = ["pil"]
-
-        for input_type in input_types:
-            if self.TASK == "image-to-image":
-                inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
-            else:
-                inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-            for output_type in ["np", "pil", "latent"]:
-                inputs["output_type"] = output_type
-                outputs = pipeline(**inputs).images
-                if output_type == "pil":
-                    self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
-                elif output_type == "np":
-                    self.assertEqual(outputs.shape, (batch_size, height, width, 3))
-                else:
-                    self.assertEqual(
-                        outputs.shape,
-                        (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
-                    )
-
-    def generate_inputs(self, height=128, width=128, batch_size=1):
-        inputs = _generate_inputs(batch_size=batch_size)
-        inputs["height"] = height
-        inputs["width"] = width
-        return inputs
-
-
-class ORTStableDiffusionImg2ImgPipelineTest(ORTStableDiffusionPipelineBase):
-    SUPPORTED_ARCHITECTURES = [
-        "stable-diffusion",
-    ]
-    ORTMODEL_CLASS = ORTStableDiffusionImg2ImgPipeline
-    TASK = "image-to-image"
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_compare_diffusers_pipeline(self, model_arch: str):
-        model_args = {"test_name": model_arch, "model_arch": model_arch}
-        self._setup(model_args)
-        height, width = 128, 128
-
-        inputs = self.generate_inputs(height=height, width=width)
-        inputs["prompt"] = "A painting of a squirrel eating a burger"
-        inputs["image"] = floats_tensor((1, 3, height, width), rng=random.Random(SEED))
-
-        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-        ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images
-
-        diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
-        diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images
-
-        self.assertTrue(np.allclose(ort_output, diffusers_onnx_output, atol=1e-1))
-
-    def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"):
-        inputs = _generate_inputs(batch_size=batch_size)
-        inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type)
-        inputs["strength"] = 0.75
-        return inputs
-
-
-class ORTStableDiffusionPipelineTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = [
-        "stable-diffusion",
-    ]
-    ORTMODEL_CLASS = ORTStableDiffusionPipeline
-    TASK = "text-to-image"
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_compare_to_diffusers(self, model_arch: str):
-        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder)
-        self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder)
-        self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder)
-        self.assertIsInstance(ort_pipeline.unet, ORTModelUnet)
-        self.assertIsInstance(ort_pipeline.config, Dict)
-
-        pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
-        pipeline.safety_checker = None
-        batch_size, num_images_per_prompt, height, width = 1, 2, 64, 32
-
-        latents = ort_pipeline.prepare_latents(
-            batch_size * num_images_per_prompt,
-            ort_pipeline.unet.config["in_channels"],
-            height,
-            width,
-            dtype=np.float32,
-            generator=np.random.RandomState(0),
-        )
-
-        kwargs = {
-            "prompt": "sailing ship in storm by Leonardo da Vinci",
-            "num_inference_steps": 1,
-            "num_images_per_prompt": num_images_per_prompt,
-            "height": height,
-            "width": width,
-            "guidance_rescale": 0.1,
-        }
-
-        for output_type in ["latent", "np"]:
-            ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images
-            with torch.no_grad():
-                outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
-
-            self.assertIsInstance(ort_outputs, np.ndarray)
-            # Compare model outputs
-            self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4))
-            # Compare model devices
-            self.assertEqual(pipeline.device, ort_pipeline.device)
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_image_reproducibility(self, model_arch: str):
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        inputs = _generate_inputs()
-        height, width = 64, 32
-        np.random.seed(0)
-        ort_outputs_1 = pipeline(**inputs, height=height, width=width)
-        np.random.seed(0)
-        ort_outputs_2 = pipeline(**inputs, height=height, width=width)
-        ort_outputs_3 = pipeline(**inputs, height=height, width=width)
-        # Compare model outputs
-        self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
-        self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_negative_prompt(self, model_arch: str):
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        inputs = _generate_inputs()
-        inputs["height"], inputs["width"] = 64, 32
-        negative_prompt = ["This is a negative prompt"]
-        np.random.seed(0)
-        image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1]
-        prompt = inputs.pop("prompt")
-        embeds = []
-        for p in [prompt, negative_prompt]:
-            text_inputs = pipeline.tokenizer(
-                p,
-                padding="max_length",
-                max_length=pipeline.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="np",
-            )
-            text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32))
-            embeds.append(pipeline.text_encoder(text_inputs)[0])
-
-        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
-        np.random.seed(0)
-        image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1]
-        self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4))
-
-
-class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin):
-    SUPPORTED_ARCHITECTURES = [
-        "stable-diffusion-xl",
-    ]
-    ORTMODEL_CLASS = ORTStableDiffusionXLPipeline
-    TASK = "text-to-image"
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_compare_to_diffusers(self, model_arch: str):
-        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder)
-        self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder)
-        self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder)
-        self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder)
-        self.assertIsInstance(ort_pipeline.unet, ORTModelUnet)
-        self.assertIsInstance(ort_pipeline.config, Dict)
-
-        pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch])
-        batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32
-        latents = ort_pipeline.prepare_latents(
-            batch_size * num_images_per_prompt,
-            ort_pipeline.unet.config["in_channels"],
-            height,
-            width,
-            dtype=np.float32,
-            generator=np.random.RandomState(0),
-        )
-
-        kwargs = {
-            "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
-            "num_inference_steps": 1,
-            "num_images_per_prompt": num_images_per_prompt,
-            "height": height,
-            "width": width,
-            "guidance_rescale": 0.1,
-        }
-
-        for output_type in ["latent", "np"]:
-            ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images
-            self.assertIsInstance(ort_outputs, np.ndarray)
-            with torch.no_grad():
-                outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
-
-            # Compare model outputs
-            self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4))
-            # Compare model devices
-            self.assertEqual(pipeline.device, ort_pipeline.device)
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_image_reproducibility(self, model_arch: str):
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        inputs = _generate_inputs()
-        height, width = 64, 32
-        np.random.seed(0)
-        ort_outputs_1 = pipeline(**inputs, height=height, width=width)
-        np.random.seed(0)
-        ort_outputs_2 = pipeline(**inputs, height=height, width=width)
-        ort_outputs_3 = pipeline(**inputs, height=height, width=width)
-        self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
-        self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
-
-
-class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase):
-    SUPPORTED_ARCHITECTURES = [
-        "stable-diffusion",
-    ]
-    ORTMODEL_CLASS = ORTStableDiffusionInpaintPipeline
-    TASK = "inpainting"
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_compare_diffusers_pipeline(self, model_arch: str):
-        model_args = {"test_name": model_arch, "model_arch": model_arch}
-        self._setup(model_args)
-        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-        diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch])
-        height, width = 64, 64
-        latents_shape = (
-            1,
-            ort_pipeline.vae_decoder.config["latent_channels"],
-            height // ort_pipeline.vae_scale_factor,
-            width // ort_pipeline.vae_scale_factor,
-        )
-        inputs = self.generate_inputs(height=height, width=width)
-
-        np_latents = np.random.rand(*latents_shape).astype(np.float32)
-        torch_latents = torch.from_numpy(np_latents)
-
-        ort_outputs = ort_pipeline(**inputs, latents=np_latents).images
-        self.assertEqual(ort_outputs.shape, (1, height, width, 3))
-
-        diffusers_outputs = diffusers_pipeline(**inputs, latents=torch_latents).images
-        self.assertEqual(diffusers_outputs.shape, (1, height, width, 3))
-
-        self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4))
-
-    def generate_inputs(self, height=128, width=128, batch_size=1):
-        inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width)
-        inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0]
-        inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0]
-        return inputs
-
-
-class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin):
-    SUPPORTED_ARCHITECTURES = [
-        "stable-diffusion-xl",
-    ]
-    ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline
-    TASK = "image-to-image"
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    def test_inference(self, model_arch: str):
-        model_args = {"test_name": model_arch, "model_arch": model_arch}
-        self._setup(model_args)
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-
-        height, width = 128, 128
-        inputs = self.generate_inputs(height=height, width=width)
-        inputs["image"] = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        ).resize((width, height))
-        output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1]
-        expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080])
-
-        self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1))
-
-    def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"):
-        inputs = _generate_inputs(batch_size=batch_size)
-        inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type)
-        inputs["strength"] = 0.75
-        return inputs
-
-
-class ImageProcessorTest(unittest.TestCase):
-    def test_vae_image_processor_pt(self):
-        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
-        input_pt = torch.stack(_create_image(height=8, width=8, batch_size=1, input_type="pt"))
-        input_np = to_np(input_pt)
-
-        for output_type in ["np", "pil"]:
-            out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type)
-            out_np = to_np(out)
-            in_np = (input_np * 255).round() if output_type == "pil" else input_np
-            self.assertTrue(np.allclose(in_np, out_np, atol=1e-6))
-
-    def test_vae_image_processor_np(self):
-        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
-        input_np = np.stack(_create_image(height=8, width=8, input_type="np"))
-        for output_type in ["np", "pil"]:
-            out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type)
-            out_np = to_np(out)
-            in_np = (input_np * 255).round() if output_type == "pil" else input_np
-            self.assertTrue(np.allclose(in_np, out_np, atol=1e-6))
-
-    def test_vae_image_processor_pil(self):
-        image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
-        input_pil = _create_image(height=8, width=8, batch_size=1, input_type="pil")
-
-        for output_type in ["np", "pil"]:
-            out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type)
-            for i, o in zip(input_pil, out):
-                in_np = np.array(i)
-                out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round()
-                self.assertTrue(np.allclose(in_np, out_np, atol=1e-6))
-
-
-class ORTLatentConsistencyModelPipelineTest(ORTModelTestMixin):
-    SUPPORTED_ARCHITECTURES = [
-        "latent-consistency",
-    ]
-    ORTMODEL_CLASS = ORTLatentConsistencyModelPipeline
-    TASK = "text-to-image"
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @require_diffusers
-    @unittest.skipIf(
-        parse(_diffusers_version) <= Version("0.21.4"),
-        "not supported with this diffusers version, needs diffusers>=v0.22.0",
-    )
-    def test_compare_to_diffusers(self, model_arch: str):
-        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder)
-        self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder)
-        self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder)
-        self.assertIsInstance(ort_pipeline.unet, ORTModelUnet)
-        self.assertIsInstance(ort_pipeline.config, Dict)
-
-        pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch])
-        batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32
-        latents = ort_pipeline.prepare_latents(
-            batch_size * num_images_per_prompt,
-            ort_pipeline.unet.config["in_channels"],
-            height,
-            width,
-            dtype=np.float32,
-            generator=np.random.RandomState(0),
-        )
-
-        kwargs = {
-            "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
-            "num_inference_steps": 1,
-            "num_images_per_prompt": num_images_per_prompt,
-            "height": height,
-            "width": width,
-            "guidance_scale": 8.5,
-        }
-
-        for output_type in ["latent", "np"]:
-            ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images
-            self.assertIsInstance(ort_outputs, np.ndarray)
-            with torch.no_grad():
-                outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
-
-            # Compare model outputs
-            self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4))
-            # Compare model devices
-            self.assertEqual(pipeline.device, ort_pipeline.device)
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index bb6935461d7..e77b9b7c20b 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -108,7 +108,7 @@
     "hubert": "hf-internal-testing/tiny-random-HubertModel",
     "ibert": "hf-internal-testing/tiny-random-IBertModel",
     "levit": "hf-internal-testing/tiny-random-LevitModel",
-    "latent-consistency": "echarlaix/tiny-random-latent-consistency",
+    "lcm": "echarlaix/tiny-random-latent-consistency",
     "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel",
     "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model",
     "longt5": "hf-internal-testing/tiny-random-LongT5Model",
@@ -213,9 +213,16 @@ def _setup(self, model_args: Dict):
                     continue
 
                 set_seed(SEED)
-                onnx_model = self.ORTMODEL_CLASS.from_pretrained(
-                    model_id, **model_args, use_io_binding=False, export=True
-                )
+                if hasattr(self, "ORTMODEL_CLASS"):
+                    onnx_model = self.ORTMODEL_CLASS.from_pretrained(
+                        model_id, **model_args, use_io_binding=False, export=True
+                    )
+                elif hasattr(self, "ORTPIPELINE_CLASS"):
+                    onnx_model = self.ORTPIPELINE_CLASS.from_pretrained(
+                        model_id, **model_args, use_io_binding=False, export=True
+                    )
+                else:
+                    raise ValueError("ORTMODEL_CLASS or ORTPIPELINE_CLASS must be defined")
 
                 model_dir = tempfile.mkdtemp(
                     prefix=f"{model_arch_and_params}_{self.TASK}_{model_id.replace('/', '_')}"

From 3123ea5fa6c201c86cb023c6301aa00afede3e15 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Tue, 27 Aug 2024 17:46:50 +0200
Subject: [PATCH 08/24] dynamic dtype

---
 optimum/onnxruntime/modeling_diffusion.py | 98 ++++++++++++++---------
 1 file changed, 61 insertions(+), 37 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 982dd123343..7445f1c6eff 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -53,6 +53,7 @@
 from huggingface_hub.utils import validate_hf_hub_args
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 from transformers.file_utils import add_end_docstrings
+from transformers.modeling_outputs import ModelOutput
 
 import onnxruntime as ort
 
@@ -72,9 +73,9 @@
     DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
 )
+from .io_binding import TypeHelper
 from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel
 from .utils import (
-    _ORT_TO_NP_TYPE,
     ONNX_WEIGHTS_NAME,
     get_provider_for_device,
     parse_device,
@@ -501,14 +502,23 @@ class _ORTDiffusionModelPart:
 
     CONFIG_NAME = "config.json"
 
+    _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs
+    _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs
+
     def __init__(self, session: ort.InferenceSession, parent_model: ORTModel):
         self.session = session
         self.parent_model = parent_model
-        self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())}
-        self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())}
         config_path = Path(session._model_path).parent / self.CONFIG_NAME
         self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {}
-        self.input_dtype = {inputs.name: _ORT_TO_NP_TYPE[inputs.type] for inputs in self.session.get_inputs()}
+        self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())}
+        self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())}
+        self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()}
+        self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()}
+
+    @property
+    def input_dtype(self):
+        # for backward compatibility
+        return {key: TypeHelper.ort_type_to_numpy_type(value) for key, value in self.input_dtypes.items()}
 
     @property
     def device(self):
@@ -523,12 +533,16 @@ def __call__(self, *args, **kwargs):
 
 
 class ORTModelTextEncoder(_ORTDiffusionModelPart):
-    def forward(self, input_ids: np.ndarray):
-        onnx_inputs = {
-            "input_ids": input_ids,
-        }
-        outputs = self.session.run(None, onnx_inputs)
-        return outputs
+    def forward(self, input_ids: Union[np.ndarray, torch.Tensor]):
+        use_torch = isinstance(input_ids, torch.Tensor)
+
+        model_inputs = {"input_ids": input_ids}
+
+        onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs)
+        onnx_outputs = self.session.run(None, onnx_inputs)
+        model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs)
+
+        return ModelOutput(**model_outputs)
 
 
 class ORTModelUnet(_ORTDiffusionModelPart):
@@ -537,45 +551,55 @@ def __init__(self, session: ort.InferenceSession, parent_model: ORTModel):
 
     def forward(
         self,
-        sample: np.ndarray,
-        timestep: np.ndarray,
-        encoder_hidden_states: np.ndarray,
-        text_embeds: Optional[np.ndarray] = None,
-        time_ids: Optional[np.ndarray] = None,
-        timestep_cond: Optional[np.ndarray] = None,
+        sample: Union[np.ndarray, torch.Tensor],
+        timestep: Union[np.ndarray, torch.Tensor],
+        encoder_hidden_states: Union[np.ndarray, torch.Tensor],
+        text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None,
     ):
-        onnx_inputs = {
+        use_torch = isinstance(sample, torch.Tensor)
+
+        model_inputs = {
             "sample": sample,
             "timestep": timestep,
             "encoder_hidden_states": encoder_hidden_states,
+            "text_embeds": text_embeds,
+            "time_ids": time_ids,
+            "timestep_cond": timestep_cond,
         }
 
-        if text_embeds is not None:
-            onnx_inputs["text_embeds"] = text_embeds
-        if time_ids is not None:
-            onnx_inputs["time_ids"] = time_ids
-        if timestep_cond is not None:
-            onnx_inputs["timestep_cond"] = timestep_cond
-        outputs = self.session.run(None, onnx_inputs)
-        return outputs
+        onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs)
+        onnx_outputs = self.session.run(None, onnx_inputs)
+        model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs)
+
+        return ModelOutput(**model_outputs)
 
 
 class ORTModelVaeDecoder(_ORTDiffusionModelPart):
-    def forward(self, latent_sample: np.ndarray):
-        onnx_inputs = {
-            "latent_sample": latent_sample,
-        }
-        outputs = self.session.run(None, onnx_inputs)
-        return outputs
+    def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]):
+        use_torch = isinstance(latent_sample, torch.Tensor)
+
+        model_inputs = {"latent_sample": latent_sample}
+
+        onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs)
+        onnx_outputs = self.session.run(None, onnx_inputs)
+        model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs)
+
+        return ModelOutput(**model_outputs)
 
 
 class ORTModelVaeEncoder(_ORTDiffusionModelPart):
-    def forward(self, sample: np.ndarray):
-        onnx_inputs = {
-            "sample": sample,
-        }
-        outputs = self.session.run(None, onnx_inputs)
-        return outputs
+    def forward(self, sample: Union[np.ndarray, torch.Tensor]):
+        use_torch = isinstance(sample, torch.Tensor)
+
+        model_inputs = {"sample": sample}
+
+        onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs)
+        onnx_outputs = self.session.run(None, onnx_inputs)
+        model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs)
+
+        return ModelOutput(**model_outputs)
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)

From 7803ef311e6efedcebf2220d8290d8216652d022 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Tue, 27 Aug 2024 17:50:36 +0200
Subject: [PATCH 09/24] support torch random numbers generator

---
 .../diffusers/pipeline_latent_consistency.py  |  6 +-
 .../diffusers/pipeline_stable_diffusion.py    | 16 ++++--
 .../pipeline_stable_diffusion_img2img.py      | 56 +++++++++++++++----
 .../pipeline_stable_diffusion_inpaint.py      | 22 +++++---
 .../diffusers/pipeline_stable_diffusion_xl.py | 21 +++++--
 .../pipeline_stable_diffusion_xl_img2img.py   | 28 +++++++---
 optimum/pipelines/diffusers/pipeline_utils.py |  8 +--
 7 files changed, 115 insertions(+), 42 deletions(-)

diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py
index 41c85b5b6ac..630d463de73 100644
--- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py
+++ b/optimum/pipelines/diffusers/pipeline_latent_consistency.py
@@ -36,7 +36,7 @@ def __call__(
         original_inference_steps: int = None,
         guidance_scale: float = 8.5,
         num_images_per_prompt: int = 1,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
         latents: Optional[np.ndarray] = None,
         prompt_embeds: Optional[np.ndarray] = None,
         output_type: str = "pil",
@@ -66,7 +66,7 @@ def __call__(
                 usually at the expense of lower image quality.
             num_images_per_prompt (`int`, defaults to 1):
                 The number of images to generate per prompt.
-            generator (`Optional[np.random.RandomState]`, defaults to `None`)::
+            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
                 A np.random.RandomState to make generation deterministic.
             latents (`Optional[np.ndarray]`, defaults to `None`):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
@@ -121,7 +121,7 @@ def __call__(
             batch_size = prompt_embeds.shape[0]
 
         if generator is None:
-            generator = np.random
+            generator = np.random.RandomState()
 
         prompt_embeds = self._encode_prompt(
             prompt,
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py
index 98bff0de44d..6cc47fab1b9 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py
@@ -189,7 +189,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
             )
 
         if latents is None:
-            latents = generator.randn(*shape).astype(dtype)
+            if isinstance(generator, np.random.RandomState):
+                latents = generator.randn(*shape).astype(dtype)
+            elif isinstance(generator, torch.Generator):
+                latents = torch.randn(*shape, generator=generator).numpy().astype(dtype)
+            else:
+                raise ValueError(
+                    f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
+                    f" {type(generator)}."
+                )
         elif latents.shape != shape:
             raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
 
@@ -209,7 +217,7 @@ def __call__(
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
         latents: Optional[np.ndarray] = None,
         prompt_embeds: Optional[np.ndarray] = None,
         negative_prompt_embeds: Optional[np.ndarray] = None,
@@ -248,7 +256,7 @@ def __call__(
             eta (`float`, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`Optional[np.random.RandomState]`, defaults to `None`)::
+            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`)::
                 A np.random.RandomState to make generation deterministic.
             latents (`Optional[np.ndarray]`, defaults to `None`):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
@@ -303,7 +311,7 @@ def __call__(
             batch_size = prompt_embeds.shape[0]
 
         if generator is None:
-            generator = np.random
+            generator = np.random.RandomState()
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
index 81a6ffa1e04..f7f0586ac90 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
@@ -16,7 +16,7 @@
 from typing import Callable, List, Optional, Union
 
 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import deprecate
@@ -72,6 +72,43 @@ def check_inputs(
                     f" {negative_prompt_embeds.shape}."
                 )
 
+    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None):
+        batch_size = batch_size * num_images_per_prompt
+
+        if image.shape[1] == 4:
+            init_latents = image
+        else:
+            init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215)
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = np.concatenate([init_latents], axis=0)
+
+        # add noise to latents using the timesteps
+        if isinstance(generator, np.random.RandomState):
+            noise = generator.randn(*init_latents.shape).astype(dtype)
+        elif isinstance(generator, torch.Generator):
+            noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype)
+        else:
+            raise ValueError(
+                f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
+                f" {type(generator)}."
+            )
+
+        init_latents = self.scheduler.add_noise(
+            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
+        ).numpy()
+
+        return init_latents
+
     # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__
     def __call__(
         self,
@@ -83,7 +120,7 @@ def __call__(
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
         prompt_embeds: Optional[np.ndarray] = None,
         negative_prompt_embeds: Optional[np.ndarray] = None,
         output_type: str = "pil",
@@ -125,7 +162,7 @@ def __call__(
             eta (`float`, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`Optional[np.random.RandomState]`, defaults to `None`)::
+            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
                 A np.random.RandomState to make generation deterministic.
             prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
@@ -168,7 +205,7 @@ def __call__(
             batch_size = prompt_embeds.shape[0]
 
         if generator is None:
-            generator = np.random
+            generator = np.random.RandomState()
 
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
@@ -225,12 +262,8 @@ def __call__(
         timesteps = self.scheduler.timesteps.numpy()[-init_timestep]
         timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)
 
-        # add noise to latents using the timesteps
-        noise = generator.randn(*init_latents.shape).astype(latents_dtype)
-        init_latents = self.scheduler.add_noise(
-            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
-        )
-        init_latents = init_latents.numpy()
+        # 5. Prepare latent variables
+        latents = self.prepare_latents(image, timesteps, batch_size, num_images_per_prompt, latents_dtype, generator)
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -276,7 +309,8 @@ def __call__(
             # call the callback, if provided
             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                 if callback is not None and i % callback_steps == 0:
-                    callback(i, t, latents)
+                    step_idx = i // getattr(self.scheduler, "order", 1)
+                    callback(step_idx, t, latents)
 
         if output_type == "latent":
             image = latents
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py
index 19de793ccd0..cb3c7db96e9 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py
@@ -16,7 +16,7 @@
 from typing import Callable, List, Optional, Union
 
 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import PIL_INTERPOLATION
@@ -108,7 +108,7 @@ def __call__(
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
         latents: Optional[np.ndarray] = None,
         prompt_embeds: Optional[np.ndarray] = None,
         negative_prompt_embeds: Optional[np.ndarray] = None,
@@ -200,7 +200,7 @@ def __call__(
             batch_size = prompt_embeds.shape[0]
 
         if generator is None:
-            generator = np.random
+            generator = np.random.RandomState()
 
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
@@ -229,11 +229,19 @@ def __call__(
             width // self.vae_scale_factor,
         )
         latents_dtype = prompt_embeds.dtype
+
         if latents is None:
-            latents = generator.randn(*latents_shape).astype(latents_dtype)
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            if isinstance(generator, np.random.RandomState):
+                latents = generator.randn(*latents_shape).astype(latents_dtype)
+            elif isinstance(generator, torch.Generator):
+                latents = torch.randn(*latents_shape, generator=generator).numpy().astype(latents_dtype)
+            else:
+                raise ValueError(
+                    f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
+                    f" {type(generator)}."
+                )
+        elif latents.shape != latents_shape:
+            raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
 
         # prepare mask and masked_image
         mask, masked_image = prepare_mask_and_masked_image(
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
index 2a5e7bf78b0..3c210862acf 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
@@ -235,7 +235,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
             )
 
         if latents is None:
-            latents = generator.randn(*shape).astype(dtype)
+            if isinstance(generator, np.random.RandomState):
+                latents = generator.randn(*shape).astype(dtype)
+            elif isinstance(generator, torch.Generator):
+                latents = torch.randn(*shape, generator=generator).numpy().astype(dtype)
+            else:
+                raise ValueError(
+                    f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
+                    f" {type(generator)}."
+                )
         elif latents.shape != shape:
             raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
 
@@ -270,7 +278,7 @@ def __call__(
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
         latents: Optional[np.ndarray] = None,
         prompt_embeds: Optional[np.ndarray] = None,
         negative_prompt_embeds: Optional[np.ndarray] = None,
@@ -315,7 +323,7 @@ def __call__(
             eta (`float`, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`Optional[np.random.RandomState]`, defaults to `None`)::
+            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`)::
                 A np.random.RandomState to make generation deterministic.
             latents (`Optional[np.ndarray]`, defaults to `None`):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
@@ -383,7 +391,7 @@ def __call__(
             batch_size = prompt_embeds.shape[0]
 
         if generator is None:
-            generator = np.random
+            generator = np.random.RandomState()
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -410,6 +418,7 @@ def __call__(
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps)
         timesteps = self.scheduler.timesteps
+        print("timesteps", timesteps)
 
         # 5. Prepare latent variables
         latents = self.prepare_latents(
@@ -440,6 +449,7 @@ def __call__(
         timestep_dtype = self.unet.input_dtype.get("timestep", np.float32)
 
         # 8. Denoising loop
+
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
@@ -475,7 +485,8 @@ def __call__(
             # call the callback, if provided
             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                 if callback is not None and i % callback_steps == 0:
-                    callback(i, t, latents)
+                    step_idx = i // getattr(self.scheduler, "order", 1)
+                    callback(step_idx, t, latents)
 
         if output_type == "latent":
             image = latents
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py
index a07903a735e..19988599b64 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py
@@ -17,7 +17,7 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
 
@@ -222,7 +222,7 @@ def get_timesteps(self, num_inference_steps, strength):
         return timesteps, num_inference_steps - t_start
 
     # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
-    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None):
+    def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None):
         batch_size = batch_size * num_images_per_prompt
 
         if image.shape[1] == 4:
@@ -242,11 +242,22 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt
             init_latents = np.concatenate([init_latents], axis=0)
 
         # add noise to latents using the timesteps
-        noise = generator.randn(*init_latents.shape).astype(dtype)
+        if isinstance(generator, np.random.RandomState):
+            noise = generator.randn(*init_latents.shape).astype(dtype)
+        elif isinstance(generator, torch.Generator):
+            noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype)
+        else:
+            raise ValueError(
+                f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
+                f" {type(generator)}."
+            )
+
         init_latents = self.scheduler.add_noise(
-            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timestep)
+            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
         )
-        return init_latents.numpy()
+        init_latents = init_latents.numpy()
+
+        return init_latents
 
     def _get_add_time_ids(
         self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype
@@ -274,7 +285,7 @@ def __call__(
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
+        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
         latents: Optional[np.ndarray] = None,
         prompt_embeds: Optional[np.ndarray] = None,
         negative_prompt_embeds: Optional[np.ndarray] = None,
@@ -375,7 +386,7 @@ def __call__(
             batch_size = prompt_embeds.shape[0]
 
         if generator is None:
-            generator = np.random
+            generator = np.random.RandomState()
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -482,7 +493,8 @@ def __call__(
             # call the callback, if provided
             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                 if callback is not None and i % callback_steps == 0:
-                    callback(i, t, latents)
+                    step_idx = i // getattr(self.scheduler, "order", 1)
+                    callback(step_idx, t, latents)
 
         if output_type == "latent":
             image = latents
diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py
index 869b91ffe59..e9d5986b61c 100644
--- a/optimum/pipelines/diffusers/pipeline_utils.py
+++ b/optimum/pipelines/diffusers/pipeline_utils.py
@@ -17,7 +17,7 @@
 from typing import List, Optional, Union
 
 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from diffusers import ConfigMixin
 from diffusers.image_processor import VaeImageProcessor as DiffusersVaeImageProcessor
@@ -206,7 +206,7 @@ def postprocess(
 
     def get_height_width(
         self,
-        image: [PIL.Image.Image, np.ndarray],
+        image: Union[PIL.Image.Image, np.ndarray],
         height: Optional[int] = None,
         width: Optional[int] = None,
     ):
@@ -264,10 +264,10 @@ def reshape(images: np.ndarray) -> np.ndarray:
     # TODO : remove after diffusers v0.21.0 release
     def resize(
         self,
-        image: [PIL.Image.Image, np.ndarray, torch.Tensor],
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
         height: Optional[int] = None,
         width: Optional[int] = None,
-    ) -> [PIL.Image.Image, np.ndarray, torch.Tensor]:
+    ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]:
         """
         Resize image.
         """

From aa41f422cec94979f7ec8e330a6076640d331edf Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Tue, 27 Aug 2024 17:50:57 +0200
Subject: [PATCH 10/24] compact diffusion testing suite

---
 tests/onnxruntime/test_diffusion.py          | 818 ++++++++++---------
 tests/onnxruntime/utils_onnxruntime_tests.py |  13 +-
 2 files changed, 434 insertions(+), 397 deletions(-)

diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py
index 2d5ab7a7f8b..1840725299e 100644
--- a/tests/onnxruntime/test_diffusion.py
+++ b/tests/onnxruntime/test_diffusion.py
@@ -12,9 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import random
 import unittest
-from typing import Dict
 
 import numpy as np
 import PIL
@@ -24,12 +22,8 @@
     AutoPipelineForImage2Image,
     AutoPipelineForInpainting,
     AutoPipelineForText2Image,
-    StableDiffusionPipeline,
-    StableDiffusionXLPipeline,
 )
 from diffusers.utils import load_image
-from diffusers.utils.testing_utils import floats_tensor
-from packaging.version import Version, parse
 from parameterized import parameterized
 from transformers.testing_utils import require_torch_gpu
 from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin
@@ -45,22 +39,20 @@
     ORTStableDiffusionXLImg2ImgPipeline,
     ORTStableDiffusionXLPipeline,
 )
-from optimum.onnxruntime.modeling_diffusion import (
-    ORTModelTextEncoder,
-    ORTModelUnet,
-    ORTModelVaeDecoder,
-    ORTModelVaeEncoder,
-)
 from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor
-from optimum.utils.import_utils import _diffusers_version
 from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm
 
 
-if parse(_diffusers_version) > Version("0.21.4"):
-    from diffusers import LatentConsistencyModelPipeline
+def get_generator(generator_framework, seed):
+    if generator_framework == "np":
+        return np.random.RandomState(seed)
+    elif generator_framework == "pt":
+        return torch.Generator().manual_seed(seed)
+    else:
+        raise ValueError(f"Unknown generator_framework: {generator_framework}")
 
 
-def _generate_inputs(batch_size=1):
+def _generate_prompts(batch_size=1):
     inputs = {
         "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
         "num_inference_steps": 3,
@@ -70,7 +62,7 @@ def _generate_inputs(batch_size=1):
     return inputs
 
 
-def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"):
+def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type="pil"):
     if input_type == "pil":
         image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
@@ -94,16 +86,24 @@ def to_np(image):
 
 class ORTPipelineForText2ImageTest(ORTModelTestMixin):
     ARCHITECTURE_TO_ORTMODEL_CLASS = {
+        "lcm": ORTLatentConsistencyModelPipeline,
         "stable-diffusion": ORTStableDiffusionPipeline,
         "stable-diffusion-xl": ORTStableDiffusionXLPipeline,
-        "lcm": ORTLatentConsistencyModelPipeline,
     }
 
-    AUTOMODEL_CLASS = AutoPipelineForText2Image
     ORTMODEL_CLASS = ORTPipelineForText2Image
+    AUTOMODEL_CLASS = AutoPipelineForText2Image
 
     TASK = "text-to-image"
 
+    def generate_inputs(self, height=128, width=128, batch_size=1):
+        inputs = _generate_prompts(batch_size=batch_size)
+
+        inputs["height"] = height
+        inputs["width"] = width
+
+        return inputs
+
     @require_diffusers
     def test_load_vanilla_model_which_is_not_supported(self):
         with self.assertRaises(Exception) as context:
@@ -131,12 +131,41 @@ def test_num_images_per_prompt(self, model_arch: str):
         self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
         self.assertEqual(pipeline.unet.config["in_channels"], 4)
 
-        batch_size, height = 1, 32
-        for width in [64, 32]:
-            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-            for num_images in [1, 3]:
-                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
-                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        for num_images in [1, 3]:
+            outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+            self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 128, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        if model_arch == "lcm":
+            # LCM doesn't support deterministic outputs beyond the first inference step
+            # TODO: Investigate why this is the case
+            inputs["num_inference_steps"] = 1
+
+        for output_type in ["latent", "np"]:
+            inputs["output_type"] = output_type
+
+            ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+            diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+            self.assertTrue(
+                np.allclose(ort_output, diffusers_output, atol=1e-4),
+                np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4),
+            )
+            self.assertEqual(ort_pipeline.device, diffusers_pipeline.device)
 
     @parameterized.expand(
         grid_parameters(
@@ -172,7 +201,7 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st
         model_args = {"test_name": test_name, "model_arch": model_arch}
         self._setup(model_args)
         pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
-        height, width, batch_size = 32, 64, 1
+        height, width, batch_size = 64, 32, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
         outputs = pipeline(**inputs).images
         # Verify model devices
@@ -186,19 +215,32 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st
     def test_callback(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
 
-        def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
-            callback_fn.has_been_called = True
-            callback_fn.number_of_steps += 1
+        height, width, batch_size = 64, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
 
-        callback_fn.has_been_called = False
-        callback_fn.number_of_steps = 0
+        ort_callback = Callback()
+        auto_callback = Callback()
 
-        inputs = self.generate_inputs(height=64, width=64)
-        pipeline(**inputs, callback=callback_fn, callback_steps=1)
-        self.assertTrue(callback_fn.has_been_called)
-        self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"])
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertTrue(auto_callback.has_been_called)
+        self.assertEqual(auto_callback.number_of_steps, ort_callback.number_of_steps)
 
     @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     @require_diffusers
@@ -222,55 +264,74 @@ def test_shape(self, model_arch: str):
                     (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
                 )
 
-    @parameterized.expand(["stable-diffusion", "stable-diffusion-xl"])
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     @require_diffusers
     def test_image_reproducibility(self, model_arch: str):
+        if model_arch in ["lcm"]:
+            pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs")
+
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
         pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-        inputs = _generate_inputs()
-        height, width = 64, 64
-        np.random.seed(0)
-        ort_outputs_1 = pipeline(**inputs, height=height, width=width)
-        np.random.seed(0)
-        ort_outputs_2 = pipeline(**inputs, height=height, width=width)
-        ort_outputs_3 = pipeline(**inputs, height=height, width=width)
-        self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
-        self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
-
-    @parameterized.expand(["stable-diffusion"])
+
+        for generator_framework in ["np", "pt"]:
+            ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1))
+
+            self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
+            self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     def test_negative_prompt(self, model_arch: str):
+        if model_arch in ["lcm"]:
+            pytest.skip("LCM (Latent Consistency Model) does not support negative prompts")
+
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
         height, width, batch_size = 64, 64, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
         negative_prompt = ["This is a negative prompt"]
-        np.random.seed(0)
-        image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1]
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        image_slice_1 = pipeline(
+            **inputs, negative_prompt=negative_prompt, generator=np.random.RandomState(SEED)
+        ).images[0, -3:, -3:, -1]
         prompt = inputs.pop("prompt")
-        embeds = []
-        for p in [prompt, negative_prompt]:
-            text_inputs = pipeline.tokenizer(
-                p,
-                padding="max_length",
+
+        if model_arch == "stable-diffusion-xl":
+            (
+                inputs["prompt_embeds"],
+                inputs["negative_prompt_embeds"],
+                inputs["pooled_prompt_embeds"],
+                inputs["negative_pooled_prompt_embeds"],
+            ) = pipeline._encode_prompt(prompt, 1, False, negative_prompt)
+        else:
+            text_ids = pipeline.tokenizer(
+                prompt,
                 max_length=pipeline.tokenizer.model_max_length,
+                padding="max_length",
+                return_tensors="np",
                 truncation=True,
+            ).input_ids
+            negative_text_ids = pipeline.tokenizer(
+                negative_prompt,
+                max_length=pipeline.tokenizer.model_max_length,
+                padding="max_length",
                 return_tensors="np",
-            )
-            text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32))
-            embeds.append(pipeline.text_encoder(text_inputs)[0])
+                truncation=True,
+            ).input_ids
+            inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0]
+            inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0]
 
-        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
-        np.random.seed(0)
-        image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1]
-        self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4))
+        image_slice_2 = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1]
 
-    def generate_inputs(self, height=128, width=128, batch_size=1):
-        inputs = _generate_inputs(batch_size=batch_size)
-        inputs["height"] = height
-        inputs["width"] = width
-        return inputs
+        self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1))
 
 
 class ORTPipelineForImage2ImageTest(ORTModelTestMixin):
@@ -283,6 +344,19 @@ class ORTPipelineForImage2ImageTest(ORTModelTestMixin):
 
     TASK = "image-to-image"
 
+    def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="np"):
+        inputs = _generate_prompts(batch_size=batch_size)
+
+        inputs["image"] = _generate_images(
+            height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type
+        )
+
+        inputs["strength"] = 0.75
+        inputs["height"] = height
+        inputs["width"] = width
+
+        return inputs
+
     @require_diffusers
     def test_load_vanilla_model_which_is_not_supported(self):
         with self.assertRaises(Exception) as context:
@@ -297,6 +371,7 @@ def test_load_vanilla_model_which_is_not_supported(self):
     def test_num_images_per_prompt(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
+
         pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
         self.assertEqual(pipeline.vae_scale_factor, 2)
         self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
@@ -320,9 +395,11 @@ def test_num_images_per_prompt(self, model_arch: str):
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         model_args = {"test_name": test_name, "model_arch": model_arch}
         self._setup(model_args)
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+
         height, width, batch_size = 32, 64, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
         outputs = pipeline(**inputs).images
         # Verify model devices
         self.assertEqual(pipeline.device.type.lower(), "cuda")
@@ -342,9 +419,11 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
     def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
         model_args = {"test_name": test_name, "model_arch": model_arch}
         self._setup(model_args)
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+
         height, width, batch_size = 32, 64, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
         outputs = pipeline(**inputs).images
         # Verify model devices
         self.assertEqual(pipeline.device.type.lower(), "cuda")
@@ -355,26 +434,47 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st
     @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     @require_diffusers
     def test_callback(self, model_arch: str):
-        def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
-            callback_fn.has_been_called = True
-            callback_fn.number_of_steps += 1
-
-        pipe = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        callback_fn.has_been_called = False
-        callback_fn.number_of_steps = 0
-        inputs = self.generate_inputs(height=64, width=64)
-        pipe(**inputs, callback=callback_fn, callback_steps=1)
-        self.assertTrue(callback_fn.has_been_called)
-        self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"])
+        if model_arch in ["stable-diffusion"]:
+            pytest.skip(
+                "Stable Diffusion For Img2Img doesn't behave as expected with callbacks (doesn't call it every step with callback_steps=1)"
+            )
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["num_inference_steps"] = 3
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        ort_callback = Callback()
+        auto_callback = Callback()
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps)
 
     @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     @require_diffusers
     def test_shape(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
-        height, width, batch_size = 128, 64, 1
+
         pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        height, width, batch_size = 32, 64, 1
 
         for input_type in ["np", "pil", "pt"]:
             inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
@@ -392,315 +492,259 @@ def test_shape(self, model_arch: str):
                         (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
                     )
 
-    def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"):
-        inputs = _generate_inputs(batch_size=batch_size)
-        inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type)
-        inputs["strength"] = 0.75
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        pytest.skip("Img2Img models do not support support output reproducibility for some reason")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 128, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+        self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2))
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        pytest.skip("Img2Img models do not support support output reproducibility for some reason")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        for generator_framework in ["np", "pt"]:
+            ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1))
+
+            self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
+            self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
+
+
+class ORTPipelineForInpaintingTest(ORTModelTestMixin):
+    ARCHITECTURE_TO_ORTMODEL_CLASS = {
+        "stable-diffusion": ORTStableDiffusionInpaintPipeline,
+    }
+
+    AUTOMODEL_CLASS = AutoPipelineForInpainting
+    ORTMODEL_CLASS = ORTPipelineForInpainting
+
+    TASK = "inpainting"
+
+    def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"):
+        assert batch_size == 1, "Inpainting models only support batch_size=1"
+        assert input_type == "pil", "Inpainting models only support input_type='pil'"
+
+        inputs = _generate_prompts(batch_size=batch_size)
+
+        inputs["image"] = _generate_images(
+            height=height, width=width, batch_size=1, channel=channel, input_type="pil"
+        )[0]
+        inputs["mask_image"] = _generate_images(
+            height=height, width=width, batch_size=1, channel=channel, input_type="pil"
+        )[0]
+
+        inputs["height"] = height
+        inputs["width"] = width
+
         return inputs
 
-    # @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
-    # @require_diffusers
-    # def test_shape(self, model_arch: str):
-    #     model_args = {"test_name": model_arch, "model_arch": model_arch}
-    #     self._setup(model_args)
-    #     height, width, batch_size = 128, 64, 1
-    #     pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-
-    #     if self.TASK == "image-to-image":
-    #         input_types = ["np", "pil", "pt"]
-    #     elif self.TASK == "text-to-image":
-    #         input_types = ["np"]
-    #     else:
-    #         input_types = ["pil"]
-
-    #     for input_type in input_types:
-    #         if self.TASK == "image-to-image":
-    #             inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
-    #         else:
-    #             inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-    #         for output_type in ["np", "pil", "latent"]:
-    #             inputs["output_type"] = output_type
-    #             outputs = pipeline(**inputs).images
-    #             if output_type == "pil":
-    #                 self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
-    #             elif output_type == "np":
-    #                 self.assertEqual(outputs.shape, (batch_size, height, width, 3))
-    #             else:
-    #                 self.assertEqual(
-    #                     outputs.shape,
-    #                     (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
-    #                 )
-
-
-# class ORTStableDiffusionImg2ImgPipelineTest(ORTStableDiffusionPipelineBase):
-#     SUPPORTED_ARCHITECTURES = ["stable-diffusion"]
-#     ORTMODEL_CLASS = ORTStableDiffusionImg2ImgPipeline
-#     TASK = "image-to-image"
-
-#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-#     @require_diffusers
-#     def test_compare_diffusers_pipeline(self, model_arch: str):
-#         model_args = {"test_name": model_arch, "model_arch": model_arch}
-#         self._setup(model_args)
-#         height, width = 128, 128
-
-#         inputs = self.generate_inputs(height=height, width=width)
-#         inputs["prompt"] = "A painting of a squirrel eating a burger"
-#         inputs["image"] = floats_tensor((1, 3, height, width), rng=random.Random(SEED))
-
-#         ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-#         ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images
-
-#         diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
-#         diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images
-
-#         self.assertTrue(np.allclose(ort_output, diffusers_onnx_output, atol=1e-1))
-
-
-# class ORTStableDiffusionPipelineTest(unittest.TestCase):
-#     SUPPORTED_ARCHITECTURES = [
-#         "stable-diffusion",
-#     ]
-#     ORTMODEL_CLASS = ORTStableDiffusionPipeline
-#     TASK = "text-to-image"
-
-
-#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-#     @require_diffusers
-#     def test_image_reproducibility(self, model_arch: str):
-#         pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-#         inputs = _generate_inputs()
-#         height, width = 64, 32
-#         np.random.seed(0)
-#         ort_outputs_1 = pipeline(**inputs, height=height, width=width)
-#         np.random.seed(0)
-#         ort_outputs_2 = pipeline(**inputs, height=height, width=width)
-#         ort_outputs_3 = pipeline(**inputs, height=height, width=width)
-#         # Compare model outputs
-#         self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
-#         self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
-
-#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-#     def test_negative_prompt(self, model_arch: str):
-#         pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-#         inputs = _generate_inputs()
-#         inputs["height"], inputs["width"] = 64, 32
-#         negative_prompt = ["This is a negative prompt"]
-#         np.random.seed(0)
-#         image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1]
-#         prompt = inputs.pop("prompt")
-#         embeds = []
-#         for p in [prompt, negative_prompt]:
-#             text_inputs = pipeline.tokenizer(
-#                 p,
-#                 padding="max_length",
-#                 max_length=pipeline.tokenizer.model_max_length,
-#                 truncation=True,
-#                 return_tensors="np",
-#             )
-#             text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32))
-#             embeds.append(pipeline.text_encoder(text_inputs)[0])
-
-#         inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
-#         np.random.seed(0)
-#         image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1]
-#         self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4))
-
-
-# class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin):
-#     SUPPORTED_ARCHITECTURES = [
-#         "stable-diffusion-xl",
-#     ]
-#     ORTMODEL_CLASS = ORTStableDiffusionXLPipeline
-#     TASK = "text-to-image"
-
-#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-#     @require_diffusers
-#     def test_compare_to_diffusers(self, model_arch: str):
-#         ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-#         self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder)
-#         self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder)
-#         self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder)
-#         self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder)
-#         self.assertIsInstance(ort_pipeline.unet, ORTModelUnet)
-#         self.assertIsInstance(ort_pipeline.config, Dict)
-
-#         pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch])
-#         batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32
-#         latents = ort_pipeline.prepare_latents(
-#             batch_size * num_images_per_prompt,
-#             ort_pipeline.unet.config["in_channels"],
-#             height,
-#             width,
-#             dtype=np.float32,
-#             generator=np.random.RandomState(0),
-#         )
-
-#         kwargs = {
-#             "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
-#             "num_inference_steps": 1,
-#             "num_images_per_prompt": num_images_per_prompt,
-#             "height": height,
-#             "width": width,
-#             "guidance_rescale": 0.1,
-#         }
-
-#         for output_type in ["latent", "np"]:
-#             ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images
-#             self.assertIsInstance(ort_outputs, np.ndarray)
-#             with torch.no_grad():
-#                 outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
-
-#             # Compare model outputs
-#             self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4))
-#             # Compare model devices
-#             self.assertEqual(pipeline.device, ort_pipeline.device)
-
-#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-#     @require_diffusers
-#     def test_image_reproducibility(self, model_arch: str):
-#         pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-#         inputs = _generate_inputs()
-#         height, width = 64, 32
-#         np.random.seed(0)
-#         ort_outputs_1 = pipeline(**inputs, height=height, width=width)
-#         np.random.seed(0)
-#         ort_outputs_2 = pipeline(**inputs, height=height, width=width)
-#         ort_outputs_3 = pipeline(**inputs, height=height, width=width)
-#         self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
-#         self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
-
-
-# class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase):
-#     SUPPORTED_ARCHITECTURES = [
-#         "stable-diffusion",
-#     ]
-#     ORTMODEL_CLASS = ORTStableDiffusionInpaintPipeline
-#     TASK = "inpainting"
-
-#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-#     @require_diffusers
-#     def test_compare_diffusers_pipeline(self, model_arch: str):
-#         model_args = {"test_name": model_arch, "model_arch": model_arch}
-#         self._setup(model_args)
-#         ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-#         diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch])
-#         height, width = 64, 64
-#         latents_shape = (
-#             1,
-#             ort_pipeline.vae_decoder.config["latent_channels"],
-#             height // ort_pipeline.vae_scale_factor,
-#             width // ort_pipeline.vae_scale_factor,
-#         )
-#         inputs = self.generate_inputs(height=height, width=width)
-
-#         np_latents = np.random.rand(*latents_shape).astype(np.float32)
-#         torch_latents = torch.from_numpy(np_latents)
-
-#         ort_outputs = ort_pipeline(**inputs, latents=np_latents).images
-#         self.assertEqual(ort_outputs.shape, (1, height, width, 3))
-
-#         diffusers_outputs = diffusers_pipeline(**inputs, latents=torch_latents).images
-#         self.assertEqual(diffusers_outputs.shape, (1, height, width, 3))
-
-#         self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4))
-
-#     def generate_inputs(self, height=128, width=128, batch_size=1):
-#         inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width)
-#         inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0]
-#         inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0]
-#         return inputs
-
-
-# class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin):
-#     SUPPORTED_ARCHITECTURES = [
-#         "stable-diffusion-xl",
-#     ]
-#     ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline
-#     TASK = "image-to-image"
-
-#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-#     @require_diffusers
-#     def test_inference(self, model_arch: str):
-#         model_args = {"test_name": model_arch, "model_arch": model_arch}
-#         self._setup(model_args)
-#         pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-
-#         height, width = 128, 128
-#         inputs = self.generate_inputs(height=height, width=width)
-#         inputs["image"] = load_image(
-#             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-#             "/in_paint/overture-creations-5sI6fQgYIuo.png"
-#         ).resize((width, height))
-#         output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1]
-#         expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080])
-
-#         self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1))
-
-#     def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"):
-#         inputs = _generate_inputs(batch_size=batch_size)
-#         inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type)
-#         inputs["strength"] = 0.75
-#         return inputs
-
-
-# class ORTLatentConsistencyModelPipelineTest(ORTModelTestMixin):
-#     SUPPORTED_ARCHITECTURES = [
-#         "latent-consistency",
-#     ]
-#     ORTMODEL_CLASS = ORTLatentConsistencyModelPipeline
-#     TASK = "text-to-image"
-
-#     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-#     @require_diffusers
-#     @unittest.skipIf(
-#         parse(_diffusers_version) <= Version("0.21.4"),
-#         "not supported with this diffusers version, needs diffusers>=v0.22.0",
-#     )
-#     def test_compare_to_diffusers(self, model_arch: str):
-#         ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-#         self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder)
-#         self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder)
-#         self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder)
-#         self.assertIsInstance(ort_pipeline.unet, ORTModelUnet)
-#         self.assertIsInstance(ort_pipeline.config, Dict)
-
-#         pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch])
-#         batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32
-#         latents = ort_pipeline.prepare_latents(
-#             batch_size * num_images_per_prompt,
-#             ort_pipeline.unet.config["in_channels"],
-#             height,
-#             width,
-#             dtype=np.float32,
-#             generator=np.random.RandomState(0),
-#         )
-
-#         kwargs = {
-#             "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
-#             "num_inference_steps": 1,
-#             "num_images_per_prompt": num_images_per_prompt,
-#             "height": height,
-#             "width": width,
-#             "guidance_scale": 8.5,
-#         }
-
-#         for output_type in ["latent", "np"]:
-#             ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images
-#             self.assertIsInstance(ort_outputs, np.ndarray)
-#             with torch.no_grad():
-#                 outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
-
-#             # Compare model outputs
-#             self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4))
-#             # Compare model devices
-#             self.assertEqual(pipeline.device, ort_pipeline.device)
+    @require_diffusers
+    def test_load_vanilla_model_which_is_not_supported(self):
+        with self.assertRaises(Exception) as context:
+            _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
+
+        self.assertIn(
+            f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
+        )
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        batch_size, height = 1, 32
+        for width in [64, 32]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+            for num_images in [1, 3]:
+                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters(
+            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]}
+        )
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outptus
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters(
+            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]}
+        )
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outptus
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["num_inference_steps"] = 3
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        ort_callback = Callback()
+        auto_callback = Callback()
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps)
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        height, width, batch_size = 32, 64, 1
+
+        for input_type in ["pil"]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
+
+            for output_type in ["np", "pil", "latent"]:
+                inputs["output_type"] = output_type
+                outputs = pipeline(**inputs).images
+                if output_type == "pil":
+                    self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+                elif output_type == "np":
+                    self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+                else:
+                    self.assertEqual(
+                        outputs.shape,
+                        (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                    )
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        latents_shape = (
+            batch_size,
+            ort_pipeline.vae_decoder.config["latent_channels"],
+            height // ort_pipeline.vae_scale_factor,
+            width // ort_pipeline.vae_scale_factor,
+        )
+
+        np_latents = np.random.rand(*latents_shape).astype(np.float32)
+        torch_latents = torch.from_numpy(np_latents)
+
+        ort_output = ort_pipeline(**inputs, latents=np_latents).images
+        diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images
+
+        self.assertTrue(
+            np.allclose(ort_output, diffusers_output, atol=1e-4),
+            np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4),
+        )
+
+    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        for generator_framework in ["np", "pt"]:
+            ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1))
+
+            self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
+            self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
 
 
 class ImageProcessorTest(unittest.TestCase):
     def test_vae_image_processor_pt(self):
         image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
-        input_pt = torch.stack(_create_image(height=8, width=8, batch_size=1, input_type="pt"))
+        input_pt = torch.stack(_generate_images(height=8, width=8, batch_size=1, input_type="pt"))
         input_np = to_np(input_pt)
 
         for output_type in ["np", "pil"]:
@@ -711,7 +755,7 @@ def test_vae_image_processor_pt(self):
 
     def test_vae_image_processor_np(self):
         image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
-        input_np = np.stack(_create_image(height=8, width=8, input_type="np"))
+        input_np = np.stack(_generate_images(height=8, width=8, input_type="np"))
         for output_type in ["np", "pil"]:
             out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type)
             out_np = to_np(out)
@@ -720,7 +764,7 @@ def test_vae_image_processor_np(self):
 
     def test_vae_image_processor_pil(self):
         image_processor = VaeImageProcessor(do_resize=False, do_normalize=True)
-        input_pil = _create_image(height=8, width=8, batch_size=1, input_type="pil")
+        input_pil = _generate_images(height=8, width=8, batch_size=1, input_type="pil")
 
         for output_type in ["np", "pil"]:
             out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type)
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index e77b9b7c20b..aa06476498e 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -213,16 +213,9 @@ def _setup(self, model_args: Dict):
                     continue
 
                 set_seed(SEED)
-                if hasattr(self, "ORTMODEL_CLASS"):
-                    onnx_model = self.ORTMODEL_CLASS.from_pretrained(
-                        model_id, **model_args, use_io_binding=False, export=True
-                    )
-                elif hasattr(self, "ORTPIPELINE_CLASS"):
-                    onnx_model = self.ORTPIPELINE_CLASS.from_pretrained(
-                        model_id, **model_args, use_io_binding=False, export=True
-                    )
-                else:
-                    raise ValueError("ORTMODEL_CLASS or ORTPIPELINE_CLASS must be defined")
+                onnx_model = self.ORTMODEL_CLASS.from_pretrained(
+                    model_id, **model_args, use_io_binding=False, export=True
+                )
 
                 model_dir = tempfile.mkdtemp(
                     prefix=f"{model_arch_and_params}_{self.TASK}_{model_id.replace('/', '_')}"

From 4837828102b2cbd876af9c9aef6f44a8d0651d5b Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Tue, 27 Aug 2024 19:02:00 +0200
Subject: [PATCH 11/24] fix

---
 tests/onnxruntime/test_diffusion.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py
index 1840725299e..a8b82dd7c4f 100644
--- a/tests/onnxruntime/test_diffusion.py
+++ b/tests/onnxruntime/test_diffusion.py
@@ -352,8 +352,6 @@ def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_
         )
 
         inputs["strength"] = 0.75
-        inputs["height"] = height
-        inputs["width"] = width
 
         return inputs
 
@@ -694,6 +692,11 @@ def test_shape(self, model_arch: str):
     @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     @require_diffusers
     def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        if model_arch in ["stable-diffusion"]:
+            pytest.skip(
+                "Stable Diffusion For Inpainting fails, it was used to be compared to StableDiffusionPipeline for some reason which is the text-to-image variant"
+            )
+
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
 

From 80532b3bad2e6b82b2f057672ec339cc18ab35ac Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Sat, 7 Sep 2024 17:06:56 +0200
Subject: [PATCH 12/24] test

---
 optimum/onnxruntime/base.py               |  12 +-
 optimum/onnxruntime/modeling_diffusion.py | 214 +++++++++++-----------
 2 files changed, 107 insertions(+), 119 deletions(-)

diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py
index d9877670ba8..5206edfc081 100644
--- a/optimum/onnxruntime/base.py
+++ b/optimum/onnxruntime/base.py
@@ -22,7 +22,6 @@
 
 from onnxruntime import InferenceSession
 
-from ..utils import NormalizedConfigManager
 from ..utils.logging import warn_once
 from .io_binding import TypeHelper
 from .modeling_ort import ORTModel
@@ -41,17 +40,10 @@ class ORTModelPart:
     _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs
     _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs
 
-    def __init__(
-        self,
-        session: InferenceSession,
-        parent_model: "ORTModel",
-    ):
+    def __init__(self, session: InferenceSession, parent_model: "ORTModel"):
         self.session = session
         self.parent_model = parent_model
-        self.normalized_config = NormalizedConfigManager.get_normalized_config_class(
-            self.parent_model.config.model_type
-        )(self.parent_model.config)
-        self.main_input_name = self.parent_model.main_input_name
+
         self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())}
         self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())}
         self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()}
diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 7e998b4a897..606919ea7f4 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -17,7 +17,6 @@
 import os
 import shutil
 import warnings
-from abc import abstractmethod
 from collections import OrderedDict
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -41,11 +40,6 @@
     StableDiffusionXLImg2ImgPipeline,
     StableDiffusionXLPipeline,
 )
-from diffusers.pipelines.auto_pipeline import (
-    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
-    AUTO_INPAINT_PIPELINES_MAPPING,
-    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
-)
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
 from huggingface_hub import snapshot_download
@@ -73,6 +67,7 @@
     DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
 )
+from .base import ORTModelPart
 from .io_binding import TypeHelper
 from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel
 from .utils import (
@@ -86,25 +81,25 @@
 logger = logging.getLogger(__name__)
 
 
-class ORTDiffusionPipeline(ORTModel):
-    auto_model_class = DiffusionPipeline
-    main_input_name = "prompt"
-    base_model_prefix = "onnx_model"
+class ORTPipeline(ORTModel):
+    auto_model_class = None
+    model_type = "onnx_pipeline"
+
     config_name = "model_index.json"
     sub_component_config_name = "config.json"
 
-    # TODO: instead of having a bloated init, we should probably have an init per pipeline,
-    # so that we can easily add new pipelines without having to modify the base class
+    main_input_name = "prompt"
+
     def __init__(
         self,
         vae_decoder_session: ort.InferenceSession,
-        text_encoder_session: ort.InferenceSession,
         unet_session: ort.InferenceSession,
-        config: Dict[str, Any],
         tokenizer: CLIPTokenizer,
+        config: Dict[str, Any],
         scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
         feature_extractor: Optional[CLIPFeatureExtractor] = None,
         vae_encoder_session: Optional[ort.InferenceSession] = None,
+        text_encoder_session: Optional[ort.InferenceSession] = None,
         text_encoder_2_session: Optional[ort.InferenceSession] = None,
         tokenizer_2: Optional[CLIPTokenizer] = None,
         use_io_binding: Optional[bool] = None,
@@ -113,23 +108,28 @@ def __init__(
         """
         Args:
             vae_decoder_session (`ort.InferenceSession`):
-                The ONNX Runtime inference session associated to the VAE decoder.
-            text_encoder_session (`ort.InferenceSession`):
-                The ONNX Runtime inference session associated to the text encoder.
+                The ONNX Runtime inference session associated to the VAE decoder
             unet_session (`ort.InferenceSession`):
                 The ONNX Runtime inference session associated to the U-NET.
+            tokenizer (`CLIPTokenizer`):
+                Tokenizer of class
+                [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
+                for the text encoder.
             config (`Dict[str, Any]`):
                 A config dictionary from which the model components will be instantiated. Make sure to only load
                 configuration files of compatible classes.
-            tokenizer (`CLIPTokenizer`):
-                Tokenizer of class
-                [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
             scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`):
                 A scheduler to be used in combination with the U-NET component to denoise the encoded image latents.
             feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`):
                 A model extracting features from generated images to be used as inputs for the `safety_checker`
             vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`):
                 The ONNX Runtime inference session associated to the VAE encoder.
+            text_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`):
+                The ONNX Runtime inference session associated to the text encoder.
+            tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`):
+                Tokenizer of class
+                [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
+                for the second text encoder.
             use_io_binding (`Optional[bool]`, defaults to `None`):
                 Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to
                 `True` if the device is CUDA, otherwise defaults to `False`.
@@ -137,7 +137,7 @@ def __init__(
                 The directory under which the model exported to ONNX was saved.
         """
         self.shared_attributes_init(
-            vae_decoder_session,
+            model=vae_decoder_session,
             use_io_binding=use_io_binding,
             model_save_dir=model_save_dir,
         )
@@ -418,7 +418,7 @@ def _from_transformers(
         provider_options: Optional[Dict[str, Any]] = None,
         use_io_binding: Optional[bool] = None,
         task: Optional[str] = None,
-    ) -> "ORTDiffusionPipeline":
+    ) -> "ORTPipeline":
         if use_auth_token is not None:
             warnings.warn(
                 "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.",
@@ -499,46 +499,27 @@ def _save_config(self, save_directory):
         self.save_config(save_directory)
 
 
-# TODO : Use ORTModelPart once IOBinding support is added
-class _ORTDiffusionModelPart:
-    """
-    For multi-file ONNX models, represents a part of the model.
-    It has its own `onnxruntime.InferenceSession`, and can perform a forward pass.
-    """
-
+class ORTPipelinePart(ORTModelPart):
     CONFIG_NAME = "config.json"
 
-    _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs
-    _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs
-
-    def __init__(self, session: ort.InferenceSession, parent_model: ORTModel):
-        self.session = session
-        self.parent_model = parent_model
+    def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline):
         config_path = Path(session._model_path).parent / self.CONFIG_NAME
-        self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {}
-        self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())}
-        self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())}
-        self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()}
-        self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()}
 
-    @property
-    def input_dtype(self):
-        # for backward compatibility
-        return {key: TypeHelper.ort_type_to_numpy_type(value) for key, value in self.input_dtypes.items()}
-
-    @property
-    def device(self):
-        return self.parent_model.device
+        if config_path.is_file():
+            # TODO: use FrozenDict
+            self.config = parent_model._dict_from_json_file(config_path)
+        else:
+            self.config = {}
 
-    @abstractmethod
-    def forward(self, *args, **kwargs):
-        pass
+        super().__init__(session, parent_model)
 
-    def __call__(self, *args, **kwargs):
-        return self.forward(*args, **kwargs)
+    @property
+    def input_dtype(self):
+        # for backward compatibility and diffusion mixins (will be standardized in the future)
+        return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()}
 
 
-class ORTModelTextEncoder(_ORTDiffusionModelPart):
+class ORTModelTextEncoder(ORTPipelinePart):
     def forward(self, input_ids: Union[np.ndarray, torch.Tensor]):
         use_torch = isinstance(input_ids, torch.Tensor)
 
@@ -551,10 +532,7 @@ def forward(self, input_ids: Union[np.ndarray, torch.Tensor]):
         return ModelOutput(**model_outputs)
 
 
-class ORTModelUnet(_ORTDiffusionModelPart):
-    def __init__(self, session: ort.InferenceSession, parent_model: ORTModel):
-        super().__init__(session, parent_model)
-
+class ORTModelUnet(ORTPipelinePart):
     def forward(
         self,
         sample: Union[np.ndarray, torch.Tensor],
@@ -582,7 +560,7 @@ def forward(
         return ModelOutput(**model_outputs)
 
 
-class ORTModelVaeDecoder(_ORTDiffusionModelPart):
+class ORTModelVaeDecoder(ORTPipelinePart):
     def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]):
         use_torch = isinstance(latent_sample, torch.Tensor)
 
@@ -595,7 +573,7 @@ def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]):
         return ModelOutput(**model_outputs)
 
 
-class ORTModelVaeEncoder(_ORTDiffusionModelPart):
+class ORTModelVaeEncoder(ORTPipelinePart):
     def forward(self, sample: Union[np.ndarray, torch.Tensor]):
         use_torch = isinstance(sample, torch.Tensor)
 
@@ -609,7 +587,7 @@ def forward(self, sample: Union[np.ndarray, torch.Tensor]):
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMixin):
+class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline).
     """
@@ -620,7 +598,7 @@ class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipelineMi
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipelineMixin):
+class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline).
     """
@@ -631,7 +609,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipelineMixin):
+class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline).
     """
@@ -642,7 +620,7 @@ class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInp
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
-class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyPipelineMixin):
+class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMixin):
     """
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline).
     """
@@ -652,7 +630,7 @@ class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyP
     __call__ = LatentConsistencyPipelineMixin.__call__
 
 
-class ORTStableDiffusionXLPipelineBase(ORTDiffusionPipeline):
+class ORTStableDiffusionXLPipelineBase(ORTPipeline):
     def __init__(
         self,
         vae_decoder_session: ort.InferenceSession,
@@ -721,6 +699,48 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
     __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__
 
 
+SUPPORTED_ORT_PIPELINES = [
+    ORTStableDiffusionPipeline,
+    ORTStableDiffusionImg2ImgPipeline,
+    ORTStableDiffusionInpaintPipeline,
+    ORTLatentConsistencyModelPipeline,
+    ORTStableDiffusionXLPipeline,
+    ORTStableDiffusionXLImg2ImgPipeline,
+]
+
+
+def _get_pipeline_class(class_name: str, throw_error_if_not_exist: bool = True):
+    for ort_pipeline_class in SUPPORTED_ORT_PIPELINES:
+        if ort_pipeline_class.auto_model_class.__name__ == class_name:
+            return ort_pipeline_class
+
+    if throw_error_if_not_exist:
+        raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {class_name}")
+
+
+class ORTDiffusionPipeline(ConfigMixin):
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_or_path, **kwargs):
+        load_config_kwargs = {
+            "force_download": kwargs.get("force_download", False),
+            "resume_download": kwargs.get("resume_download", None),
+            "local_files_only": kwargs.get("local_files_only", False),
+            "cache_dir": kwargs.get("cache_dir", None),
+            "revision": kwargs.get("revision", None),
+            "proxies": kwargs.get("proxies", None),
+            "token": kwargs.get("token", None),
+        }
+
+        config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
+        config = config[0] if isinstance(config, tuple) else config
+        class_name = config["_class_name"]
+
+        ort_pipeline_class = _get_pipeline_class(class_name)
+
+        return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs)
+
+
 ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
     [
         ("lcm", ORTLatentConsistencyModelPipeline),
@@ -742,49 +762,38 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
     ]
 )
 
-SUPPORTED_TASKS_MAPPINGS = [
+SUPPORTED_ORT_PIPELINES_MAPPINGS = [
     ORT_TEXT2IMAGE_PIPELINES_MAPPING,
     ORT_IMAGE2IMAGE_PIPELINES_MAPPING,
     ORT_INPAINT_PIPELINES_MAPPING,
 ]
 
 
-def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True):
-    def get_model(pipeline_class_name):
-        for task_mapping in SUPPORTED_TASKS_MAPPINGS:
-            for model_name, pipeline in task_mapping.items():
+def _get_task_class(mapping, pipeline_class_name):
+    def _get_model_name(pipeline_class_name):
+        for ort_pipelines_mapping in SUPPORTED_ORT_PIPELINES_MAPPINGS:
+            for model_name, ort_pipeline in ort_pipelines_mapping.items():
                 if (
-                    pipeline.__name__ == pipeline_class_name
-                    or pipeline.auto_model_class.__name__ == pipeline_class_name
+                    ort_pipeline.__name__ == pipeline_class_name
+                    or ort_pipeline.auto_model_class.__name__ == pipeline_class_name
                 ):
                     return model_name
 
-    model_name = get_model(pipeline_class_name)
+    model_name = _get_model_name(pipeline_class_name)
 
     if model_name is not None:
         task_class = mapping.get(model_name, None)
         if task_class is not None:
             return task_class
 
-    if throw_error_if_not_exist:
-        raise ValueError(f"AutoPipeline can't find a pipeline linked to {pipeline_class_name} for {model_name}")
+    raise ValueError(f"ORTPipelineForTask can't find a pipeline linked to {pipeline_class_name} for {model_name}")
 
 
-class ORTPipelineBase(ConfigMixin):
-    config_name = "model_index.json"
-
-    ort_pipeline_mapping = None
-    auto_pipeline_mapping = None
-
-    def __init__(self, *args, **kwargs):
-        raise EnvironmentError(
-            f"{self.__class__.__name__} is designed to be instantiated "
-            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
-            f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
-        )
+class ORTPipelineForTask(ConfigMixin):
+    auto_model_class = None
+    ort_pipelines_mapping = None
 
     @classmethod
-    @validate_hf_hub_args
     def from_pretrained(cls, pretrained_model_or_path, **kwargs):
         load_config_kwargs = {
             "force_download": kwargs.get("force_download", False),
@@ -795,38 +804,25 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
             "proxies": kwargs.get("proxies", None),
             "token": kwargs.get("token", None),
         }
-
         config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
         config = config[0] if isinstance(config, tuple) else config
         class_name = config["_class_name"]
 
-        ort_pipeline_cls = _get_task_class(cls.ort_pipeline_mapping, class_name)
+        ort_pipeline_class = _get_task_class(cls.ort_pipelines_mapping, class_name)
 
-        return ort_pipeline_cls.from_pretrained(pretrained_model_or_path, **kwargs)
-
-    @classmethod
-    def from_pipe(cls, **kwargs):
-        raise NotImplementedError(
-            f"from_pipe is not yet implemented for {cls.__name__}. Please use from_pretrained instead."
-        )
+        return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs)
 
 
-class ORTPipelineForText2Image(ORTPipelineBase):
+class ORTPipelineForText2Image(ORTPipelineForTask):
     auto_model_class = AutoPipelineForText2Image
-
-    ort_pipeline_mapping = ORT_TEXT2IMAGE_PIPELINES_MAPPING
-    auto_pipeline_mapping = AUTO_TEXT2IMAGE_PIPELINES_MAPPING
+    ort_pipelines_mapping = ORT_TEXT2IMAGE_PIPELINES_MAPPING
 
 
-class ORTPipelineForImage2Image(ORTPipelineBase):
+class ORTPipelineForImage2Image(ORTPipelineForTask):
     auto_model_class = AutoPipelineForImage2Image
+    ort_pipelines_mapping = ORT_IMAGE2IMAGE_PIPELINES_MAPPING
 
-    ort_pipeline_mapping = ORT_IMAGE2IMAGE_PIPELINES_MAPPING
-    auto_pipeline_mapping = AUTO_IMAGE2IMAGE_PIPELINES_MAPPING
 
-
-class ORTPipelineForInpainting(ORTPipelineBase):
+class ORTPipelineForInpainting(ORTPipelineForTask):
     auto_model_class = AutoPipelineForInpainting
-
-    ort_pipeline_mapping = ORT_INPAINT_PIPELINES_MAPPING
-    auto_pipeline_mapping = AUTO_INPAINT_PIPELINES_MAPPING
+    ort_pipelines_mapping = ORT_INPAINT_PIPELINES_MAPPING

From f99a058f7ea75578770808e116256348bada63ac Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Sat, 7 Sep 2024 17:29:14 +0200
Subject: [PATCH 13/24] test

---
 optimum/onnxruntime/base.py               |  1 +
 optimum/onnxruntime/modeling_diffusion.py | 14 +++++++++-----
 optimum/onnxruntime/modeling_seq2seq.py   | 10 ----------
 3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py
index 5206edfc081..ccfd646ea0d 100644
--- a/optimum/onnxruntime/base.py
+++ b/optimum/onnxruntime/base.py
@@ -43,6 +43,7 @@ class ORTModelPart:
     def __init__(self, session: InferenceSession, parent_model: "ORTModel"):
         self.session = session
         self.parent_model = parent_model
+        self.main_input_name = self.parent_model.main_input_name
 
         self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())}
         self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())}
diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 606919ea7f4..0d3fa2bcc54 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -30,7 +30,6 @@
     AutoPipelineForText2Image,
     ConfigMixin,
     DDIMScheduler,
-    DiffusionPipeline,
     LatentConsistencyModelPipeline,
     LMSDiscreteScheduler,
     PNDMScheduler,
@@ -88,8 +87,6 @@ class ORTPipeline(ORTModel):
     config_name = "model_index.json"
     sub_component_config_name = "config.json"
 
-    main_input_name = "prompt"
-
     def __init__(
         self,
         vae_decoder_session: ort.InferenceSession,
@@ -592,6 +589,7 @@ class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin):
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline).
     """
 
+    main_input_name = "prompt"
     auto_model_class = StableDiffusionPipeline
 
     __call__ = StableDiffusionPipelineMixin.__call__
@@ -603,6 +601,7 @@ class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipel
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline).
     """
 
+    main_input_name = "prompt"
     auto_model_class = StableDiffusionImg2ImgPipeline
 
     __call__ = StableDiffusionImg2ImgPipelineMixin.__call__
@@ -614,6 +613,7 @@ class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipel
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline).
     """
 
+    main_input_name = "prompt"
     auto_model_class = StableDiffusionInpaintPipeline
 
     __call__ = StableDiffusionInpaintPipelineMixin.__call__
@@ -625,6 +625,7 @@ class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMi
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline).
     """
 
+    main_input_name = "prompt"
     auto_model_class = LatentConsistencyModelPipeline
 
     __call__ = LatentConsistencyPipelineMixin.__call__
@@ -683,6 +684,7 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline).
     """
 
+    main_input_name = "prompt"
     auto_model_class = StableDiffusionXLPipeline
 
     __call__ = StableDiffusionXLPipelineMixin.__call__
@@ -694,6 +696,7 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
     ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline).
     """
 
+    main_input_name = "prompt"
     auto_model_class = StableDiffusionXLImg2ImgPipeline
 
     __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__
@@ -719,6 +722,8 @@ def _get_pipeline_class(class_name: str, throw_error_if_not_exist: bool = True):
 
 
 class ORTDiffusionPipeline(ConfigMixin):
+    config_name = "model_index.json"
+
     @classmethod
     @validate_hf_hub_args
     def from_pretrained(cls, pretrained_model_or_path, **kwargs):
@@ -790,8 +795,7 @@ def _get_model_name(pipeline_class_name):
 
 
 class ORTPipelineForTask(ConfigMixin):
-    auto_model_class = None
-    ort_pipelines_mapping = None
+    config_name = "model_index.json"
 
     @classmethod
     def from_pretrained(cls, pretrained_model_or_path, **kwargs):
diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py
index 4ce3e4707ed..fc185500d80 100644
--- a/optimum/onnxruntime/modeling_seq2seq.py
+++ b/optimum/onnxruntime/modeling_seq2seq.py
@@ -72,16 +72,6 @@
     from transformers.generation_utils import GenerationMixin
 
 
-# if check_if_transformers_greater("4.37.0"):
-#     # starting from transformers v4.37.0, the whisper generation loop is implemented in the `WhisperGenerationMixin`
-#     # and it implements many new features including short and long form generation, and starts with 2 init tokens
-#     from transformers.models.whisper.generation_whisper import WhisperGenerationMixin
-# else:
-
-#     class WhisperGenerationMixin(WhisperForConditionalGeneration, GenerationMixin):
-#         pass
-
-
 if check_if_transformers_greater("4.43.0"):
     from transformers.cache_utils import EncoderDecoderCache
 else:

From 781ede7d6a530d023bb78283336564c107e129ca Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Sat, 7 Sep 2024 20:13:54 +0200
Subject: [PATCH 14/24] test

---
 optimum/onnxruntime/base.py             | 41 +++++++++--------
 optimum/onnxruntime/modeling_seq2seq.py | 58 -------------------------
 2 files changed, 22 insertions(+), 77 deletions(-)

diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py
index ccfd646ea0d..b59c59ede7d 100644
--- a/optimum/onnxruntime/base.py
+++ b/optimum/onnxruntime/base.py
@@ -22,6 +22,7 @@
 
 from onnxruntime import InferenceSession
 
+from ..utils import NormalizedConfigManager
 from ..utils.logging import warn_once
 from .io_binding import TypeHelper
 from .modeling_ort import ORTModel
@@ -83,12 +84,18 @@ class ORTEncoder(ORTModelPart):
     Encoder part of the encoder-decoder model for ONNX Runtime inference.
     """
 
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: torch.LongTensor,
-        **kwargs,
-    ) -> BaseModelOutput:
+    def __init__(self, session: InferenceSession, parent_model: "ORTModel"):
+        super().__init__(session, parent_model)
+
+        config = (
+            self.parent_model.config.encoder
+            if hasattr(self.parent_model.config, "encoder")
+            else self.parent_model.config
+        )
+
+        self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
+
+    def forward(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, **kwargs) -> BaseModelOutput:
         use_torch = isinstance(input_ids, torch.Tensor)
         self.parent_model.raise_on_numpy_input_io_binding(use_torch)
 
@@ -131,6 +138,14 @@ def __init__(
     ):
         super().__init__(session, parent_model)
 
+        config = (
+            self.parent_model.config.encoder
+            if hasattr(self.parent_model.config, "encoder")
+            else self.parent_model.config
+        )
+
+        self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
+
         # TODO: make this less hacky.
         self.key_value_input_names = [key for key in self.input_names if (".key" in key) or (".value" in key)]
         self.key_value_output_names = [key for key in self.output_names if (".key" in key) or (".value" in key)]
@@ -146,11 +161,7 @@ def __init__(
 
         self.use_past_in_outputs = len(self.key_value_output_names) > 0
         self.use_past_in_inputs = len(self.key_value_input_names) > 0
-        self.use_fp16 = False
-        for inp in session.get_inputs():
-            if "past_key_values" in inp.name and inp.type == "tensor(float16)":
-                self.use_fp16 = True
-                break
+        self.use_fp16 = self.dtype == torch.float16
 
         # We may use ORTDecoderForSeq2Seq for vision-encoder-decoder models, where models as gpt2
         # can be used but do not support KV caching for the cross-attention key/values, see:
@@ -454,11 +465,3 @@ def prepare_inputs_for_merged(
                 cache_position = cache_position.to(self.device)
 
         return use_cache_branch_tensor, past_key_values, cache_position
-
-
-class ORTDecoder(ORTDecoderForSeq2Seq):
-    def __init__(self, *args, **kwargs):
-        logger.warning(
-            "The class `ORTDecoder` is deprecated and will be removed in optimum v1.15.0, please use `ORTDecoderForSeq2Seq` instead."
-        )
-        super().__init__(*args, **kwargs)
diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py
index fc185500d80..3cecadafe3e 100644
--- a/optimum/onnxruntime/modeling_seq2seq.py
+++ b/optimum/onnxruntime/modeling_seq2seq.py
@@ -46,7 +46,6 @@
 from ..onnx.utils import _get_external_data_paths
 from ..utils import check_if_transformers_greater
 from ..utils.file_utils import validate_file_exists
-from ..utils.normalized_config import NormalizedConfigManager
 from ..utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
 from .base import ORTDecoderForSeq2Seq, ORTEncoder
 from .constants import (
@@ -1155,49 +1154,6 @@ class ORTModelForSeq2SeqLM(ORTModelForConditionalGeneration, GenerationMixin):
     auto_model_class = AutoModelForSeq2SeqLM
     main_input_name = "input_ids"
 
-    def __init__(
-        self,
-        encoder_session: ort.InferenceSession,
-        decoder_session: ort.InferenceSession,
-        config: "PretrainedConfig",
-        onnx_paths: List[str],
-        decoder_with_past_session: Optional[ort.InferenceSession] = None,
-        use_cache: bool = True,
-        use_io_binding: Optional[bool] = None,
-        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
-        preprocessors: Optional[List] = None,
-        generation_config: Optional[GenerationConfig] = None,
-        **kwargs,
-    ):
-        super().__init__(
-            encoder_session,
-            decoder_session,
-            config,
-            onnx_paths,
-            decoder_with_past_session,
-            use_cache,
-            use_io_binding,
-            model_save_dir,
-            preprocessors,
-            generation_config,
-            **kwargs,
-        )
-
-        # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized.
-        if config.model_type == "encoder-decoder":
-            self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class(
-                config.encoder.model_type
-            )(config.encoder)
-
-            self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class(
-                config.decoder.model_type
-            )(config.decoder)
-
-            if self.decoder_with_past is not None:
-                self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class(
-                    config.decoder.model_type
-                )(config.decoder)
-
     def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder:
         return ORTEncoder(session, self)
 
@@ -1511,20 +1467,6 @@ def __init__(
             **kwargs,
         )
 
-        # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized.
-        self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class(
-            config.encoder.model_type
-        )(config.encoder)
-
-        self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class(
-            config.decoder.model_type
-        )(config.decoder)
-
-        if self.decoder_with_past is not None:
-            self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class(
-                config.decoder.model_type
-            )(config.decoder)
-
     def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder:
         return ORTEncoderForVisionEncoderDecoder(session, self)
 

From f0e3f2be5ccfcdb4da6bdfae32a1a5262292b699 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Sat, 7 Sep 2024 20:23:21 +0200
Subject: [PATCH 15/24] use latent-consistency architecture name instead of lcm

---
 optimum/exporters/tasks.py                   |  2 +-
 optimum/onnxruntime/__init__.py              |  2 ++
 optimum/onnxruntime/modeling_diffusion.py    |  2 +-
 tests/exporters/exporters_utils.py           |  2 +-
 tests/onnxruntime/test_diffusion.py          | 12 ++++++------
 tests/onnxruntime/utils_onnxruntime_tests.py |  2 +-
 6 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 97053040879..a489f34fb06 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -308,9 +308,9 @@ class TasksManager:
         "image-feature-extraction": "feature-extraction",
         # for backward compatibility and testing (where
         # model task and model type are still the same)
-        "lcm": "text-to-image",
         "stable-diffusion": "text-to-image",
         "stable-diffusion-xl": "text-to-image",
+        "latent-consistency": "text-to-image",
     }
 
     _CUSTOM_CLASSES = {
diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py
index 35cbf14587e..78ef2896d05 100644
--- a/optimum/onnxruntime/__init__.py
+++ b/optimum/onnxruntime/__init__.py
@@ -91,6 +91,7 @@
         "ORTPipelineForText2Image",
         "ORTPipelineForImage2Image",
         "ORTPipelineForInpainting",
+        "ORTDiffusionPipeline",
     ]
 
 
@@ -149,6 +150,7 @@
         )
     else:
         from .modeling_diffusion import (
+            ORTDiffusionPipeline,
             ORTLatentConsistencyModelPipeline,
             ORTPipelineForImage2Image,
             ORTPipelineForInpainting,
diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 0d3fa2bcc54..32c64f38ef2 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -748,9 +748,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
 
 ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
     [
-        ("lcm", ORTLatentConsistencyModelPipeline),
         ("stable-diffusion", ORTStableDiffusionPipeline),
         ("stable-diffusion-xl", ORTStableDiffusionXLPipeline),
+        ("latent-consistency", ORTLatentConsistencyModelPipeline),
     ]
 )
 
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index a55c7a124df..c8a33b0be35 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -298,7 +298,7 @@
 PYTORCH_DIFFUSION_MODEL = {
     "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch",
     "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl",
-    "lcm": "echarlaix/tiny-random-latent-consistency",
+    "latent-consistency": "echarlaix/tiny-random-latent-consistency",
 }
 
 PYTORCH_TIMM_MODEL = {
diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py
index a8b82dd7c4f..a7360ab386b 100644
--- a/tests/onnxruntime/test_diffusion.py
+++ b/tests/onnxruntime/test_diffusion.py
@@ -86,7 +86,7 @@ def to_np(image):
 
 class ORTPipelineForText2ImageTest(ORTModelTestMixin):
     ARCHITECTURE_TO_ORTMODEL_CLASS = {
-        "lcm": ORTLatentConsistencyModelPipeline,
+        "latent-consistency": ORTLatentConsistencyModelPipeline,
         "stable-diffusion": ORTStableDiffusionPipeline,
         "stable-diffusion-xl": ORTStableDiffusionXLPipeline,
     }
@@ -150,8 +150,8 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str):
         ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
         diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
 
-        if model_arch == "lcm":
-            # LCM doesn't support deterministic outputs beyond the first inference step
+        if model_arch == "latent-consistency":
+            # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step
             # TODO: Investigate why this is the case
             inputs["num_inference_steps"] = 1
 
@@ -267,7 +267,7 @@ def test_shape(self, model_arch: str):
     @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     @require_diffusers
     def test_image_reproducibility(self, model_arch: str):
-        if model_arch in ["lcm"]:
+        if model_arch in ["latent-consistency"]:
             pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs")
 
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -288,8 +288,8 @@ def test_image_reproducibility(self, model_arch: str):
 
     @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
     def test_negative_prompt(self, model_arch: str):
-        if model_arch in ["lcm"]:
-            pytest.skip("LCM (Latent Consistency Model) does not support negative prompts")
+        if model_arch in ["latent-consistency"]:
+            pytest.skip("Latent Consistency Model (LCM) does not support negative prompts")
 
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index aa06476498e..bb6935461d7 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -108,7 +108,7 @@
     "hubert": "hf-internal-testing/tiny-random-HubertModel",
     "ibert": "hf-internal-testing/tiny-random-IBertModel",
     "levit": "hf-internal-testing/tiny-random-LevitModel",
-    "lcm": "echarlaix/tiny-random-latent-consistency",
+    "latent-consistency": "echarlaix/tiny-random-latent-consistency",
     "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel",
     "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model",
     "longt5": "hf-internal-testing/tiny-random-LongT5Model",

From 80c63d087c2c7fb537a8d9740627f9042660e9a2 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Sat, 7 Sep 2024 21:32:02 +0200
Subject: [PATCH 16/24] fix

---
 optimum/onnxruntime/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py
index b59c59ede7d..0e54bafed78 100644
--- a/optimum/onnxruntime/base.py
+++ b/optimum/onnxruntime/base.py
@@ -139,8 +139,8 @@ def __init__(
         super().__init__(session, parent_model)
 
         config = (
-            self.parent_model.config.encoder
-            if hasattr(self.parent_model.config, "encoder")
+            self.parent_model.config.decoder
+            if hasattr(self.parent_model.config, "decoder")
             else self.parent_model.config
         )
 

From a4518f23ede32ebebcf9a2b0a4beb3e4d7ac86b4 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Sun, 8 Sep 2024 10:59:01 +0200
Subject: [PATCH 17/24] add ort diffusion pipeline tests

---
 optimum/onnxruntime/modeling_diffusion.py     |  15 +-
 .../diffusers/pipeline_stable_diffusion_xl.py |   1 -
 tests/onnxruntime/test_diffusion.py           | 134 ++++++++++--------
 3 files changed, 84 insertions(+), 66 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 32c64f38ef2..18cd38c5f29 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -712,13 +712,16 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
 ]
 
 
-def _get_pipeline_class(class_name: str, throw_error_if_not_exist: bool = True):
+def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True):
     for ort_pipeline_class in SUPPORTED_ORT_PIPELINES:
-        if ort_pipeline_class.auto_model_class.__name__ == class_name:
+        if (
+            ort_pipeline_class.__name__ == pipeline_class_name
+            or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name
+        ):
             return ort_pipeline_class
 
     if throw_error_if_not_exist:
-        raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {class_name}")
+        raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}")
 
 
 class ORTDiffusionPipeline(ConfigMixin):
@@ -777,10 +780,10 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs):
 def _get_task_class(mapping, pipeline_class_name):
     def _get_model_name(pipeline_class_name):
         for ort_pipelines_mapping in SUPPORTED_ORT_PIPELINES_MAPPINGS:
-            for model_name, ort_pipeline in ort_pipelines_mapping.items():
+            for model_name, ort_pipeline_class in ort_pipelines_mapping.items():
                 if (
-                    ort_pipeline.__name__ == pipeline_class_name
-                    or ort_pipeline.auto_model_class.__name__ == pipeline_class_name
+                    ort_pipeline_class.__name__ == pipeline_class_name
+                    or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name
                 ):
                     return model_name
 
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
index 3c210862acf..0407c16a77a 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
@@ -418,7 +418,6 @@ def __call__(
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps)
         timesteps = self.scheduler.timesteps
-        print("timesteps", timesteps)
 
         # 5. Prepare latent variables
         latents = self.prepare_latents(
diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py
index a7360ab386b..9f480b2d1a0 100644
--- a/tests/onnxruntime/test_diffusion.py
+++ b/tests/onnxruntime/test_diffusion.py
@@ -22,6 +22,7 @@
     AutoPipelineForImage2Image,
     AutoPipelineForInpainting,
     AutoPipelineForText2Image,
+    DiffusionPipeline,
 )
 from diffusers.utils import load_image
 from parameterized import parameterized
@@ -29,27 +30,22 @@
 from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin
 
 from optimum.onnxruntime import (
-    ORTLatentConsistencyModelPipeline,
+    ORTDiffusionPipeline,
     ORTPipelineForImage2Image,
     ORTPipelineForInpainting,
     ORTPipelineForText2Image,
-    ORTStableDiffusionImg2ImgPipeline,
-    ORTStableDiffusionInpaintPipeline,
-    ORTStableDiffusionPipeline,
-    ORTStableDiffusionXLImg2ImgPipeline,
-    ORTStableDiffusionXLPipeline,
 )
 from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor
 from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm
 
 
-def get_generator(generator_framework, seed):
-    if generator_framework == "np":
+def get_generator(framework, seed):
+    if framework == "np":
         return np.random.RandomState(seed)
-    elif generator_framework == "pt":
+    elif framework == "pt":
         return torch.Generator().manual_seed(seed)
     else:
-        raise ValueError(f"Unknown generator_framework: {generator_framework}")
+        raise ValueError(f"Unknown framework: {framework}")
 
 
 def _generate_prompts(batch_size=1):
@@ -85,11 +81,7 @@ def to_np(image):
 
 
 class ORTPipelineForText2ImageTest(ORTModelTestMixin):
-    ARCHITECTURE_TO_ORTMODEL_CLASS = {
-        "latent-consistency": ORTLatentConsistencyModelPipeline,
-        "stable-diffusion": ORTStableDiffusionPipeline,
-        "stable-diffusion-xl": ORTStableDiffusionXLPipeline,
-    }
+    SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"]
 
     ORTMODEL_CLASS = ORTPipelineForText2Image
     AUTOMODEL_CLASS = AutoPipelineForText2Image
@@ -113,15 +105,23 @@ def test_load_vanilla_model_which_is_not_supported(self):
             f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
         )
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_ort_pipeline_class_dispatch(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
-        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
-        self.assertIsInstance(pipeline, self.ARCHITECTURE_TO_ORTMODEL_CLASS[model_arch])
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_num_images_per_prompt(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -138,7 +138,7 @@ def test_num_images_per_prompt(self, model_arch: str):
             outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
             self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_compare_to_diffusers_pipeline(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -168,9 +168,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str):
             self.assertEqual(ort_pipeline.device, diffusers_pipeline.device)
 
     @parameterized.expand(
-        grid_parameters(
-            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]}
-        )
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
     )
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
@@ -189,9 +187,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         self.assertEqual(outputs.shape, (batch_size, height, width, 3))
 
     @parameterized.expand(
-        grid_parameters(
-            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]}
-        )
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
     )
     @require_torch_gpu
     @require_ort_rocm
@@ -210,7 +206,7 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st
         self.assertIsInstance(outputs, np.ndarray)
         self.assertEqual(outputs.shape, (batch_size, height, width, 3))
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_callback(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -242,7 +238,7 @@ def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
         self.assertTrue(auto_callback.has_been_called)
         self.assertEqual(auto_callback.number_of_steps, ort_callback.number_of_steps)
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_shape(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -264,7 +260,7 @@ def test_shape(self, model_arch: str):
                     (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
                 )
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_image_reproducibility(self, model_arch: str):
         if model_arch in ["latent-consistency"]:
@@ -286,7 +282,7 @@ def test_image_reproducibility(self, model_arch: str):
             self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
             self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_negative_prompt(self, model_arch: str):
         if model_arch in ["latent-consistency"]:
             pytest.skip("Latent Consistency Model (LCM) does not support negative prompts")
@@ -335,10 +331,8 @@ def test_negative_prompt(self, model_arch: str):
 
 
 class ORTPipelineForImage2ImageTest(ORTModelTestMixin):
-    ARCHITECTURE_TO_ORTMODEL_CLASS = {
-        "stable-diffusion": ORTStableDiffusionImg2ImgPipeline,
-        "stable-diffusion-xl": ORTStableDiffusionXLImg2ImgPipeline,
-    }
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"]
+
     AUTOMODEL_CLASS = AutoPipelineForImage2Image
     ORTMODEL_CLASS = ORTPipelineForImage2Image
 
@@ -364,7 +358,23 @@ def test_load_vanilla_model_which_is_not_supported(self):
             f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
         )
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(list(SUPPORTED_ARCHITECTURES))
+    @require_diffusers
+    def test_ort_pipeline_class_dispatch(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_num_images_per_prompt(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -383,9 +393,7 @@ def test_num_images_per_prompt(self, model_arch: str):
                 self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
 
     @parameterized.expand(
-        grid_parameters(
-            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]}
-        )
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
     )
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
@@ -406,9 +414,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         self.assertEqual(outputs.shape, (batch_size, height, width, 3))
 
     @parameterized.expand(
-        grid_parameters(
-            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]}
-        )
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
     )
     @require_torch_gpu
     @require_ort_rocm
@@ -429,7 +435,7 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st
         self.assertIsInstance(outputs, np.ndarray)
         self.assertEqual(outputs.shape, (batch_size, height, width, 3))
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_callback(self, model_arch: str):
         if model_arch in ["stable-diffusion"]:
@@ -465,7 +471,7 @@ def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
         self.assertTrue(ort_callback.has_been_called)
         self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps)
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_shape(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -490,7 +496,7 @@ def test_shape(self, model_arch: str):
                         (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
                     )
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_compare_to_diffusers_pipeline(self, model_arch: str):
         pytest.skip("Img2Img models do not support support output reproducibility for some reason")
@@ -509,7 +515,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str):
 
         self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2))
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_image_reproducibility(self, model_arch: str):
         pytest.skip("Img2Img models do not support support output reproducibility for some reason")
@@ -532,9 +538,7 @@ def test_image_reproducibility(self, model_arch: str):
 
 
 class ORTPipelineForInpaintingTest(ORTModelTestMixin):
-    ARCHITECTURE_TO_ORTMODEL_CLASS = {
-        "stable-diffusion": ORTStableDiffusionInpaintPipeline,
-    }
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion"]
 
     AUTOMODEL_CLASS = AutoPipelineForInpainting
     ORTMODEL_CLASS = ORTPipelineForInpainting
@@ -568,7 +572,23 @@ def test_load_vanilla_model_which_is_not_supported(self):
             f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
         )
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_ort_pipeline_class_dispatch(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_num_images_per_prompt(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -587,9 +607,7 @@ def test_num_images_per_prompt(self, model_arch: str):
                 self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
 
     @parameterized.expand(
-        grid_parameters(
-            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["CUDAExecutionProvider"]}
-        )
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
     )
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
@@ -610,9 +628,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         self.assertEqual(outputs.shape, (batch_size, height, width, 3))
 
     @parameterized.expand(
-        grid_parameters(
-            {"model_arch": list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()), "provider": ["ROCMExecutionProvider"]}
-        )
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
     )
     @require_torch_gpu
     @require_ort_rocm
@@ -633,7 +649,7 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: st
         self.assertIsInstance(outputs, np.ndarray)
         self.assertEqual(outputs.shape, (batch_size, height, width, 3))
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_callback(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -664,7 +680,7 @@ def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
         self.assertTrue(ort_callback.has_been_called)
         self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps)
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_shape(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
@@ -689,7 +705,7 @@ def test_shape(self, model_arch: str):
                         (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
                     )
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_compare_to_diffusers_pipeline(self, model_arch: str):
         if model_arch in ["stable-diffusion"]:
@@ -724,7 +740,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str):
             np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4),
         )
 
-    @parameterized.expand(list(ARCHITECTURE_TO_ORTMODEL_CLASS.keys()))
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_image_reproducibility(self, model_arch: str):
         model_args = {"test_name": model_arch, "model_arch": model_arch}

From 9f0c7b632388274f6c451d2ee597935761198b1f Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Tue, 10 Sep 2024 11:03:44 +0200
Subject: [PATCH 18/24] added dummy objects

---
 optimum/onnxruntime/__init__.py          | 10 +++++-
 optimum/utils/dummy_diffusers_objects.py | 44 ++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py
index 78ef2896d05..09a48ec955c 100644
--- a/optimum/onnxruntime/__init__.py
+++ b/optimum/onnxruntime/__init__.py
@@ -79,6 +79,10 @@
         "ORTStableDiffusionXLPipeline",
         "ORTStableDiffusionXLImg2ImgPipeline",
         "ORTLatentConsistencyModelPipeline",
+        "ORTPipelineForImage2Image",
+        "ORTPipelineForInpainting",
+        "ORTPipelineForText2Image",
+        "ORTDiffusionPipeline",
     ]
 else:
     _import_structure["modeling_diffusion"] = [
@@ -88,9 +92,9 @@
         "ORTStableDiffusionXLPipeline",
         "ORTStableDiffusionXLImg2ImgPipeline",
         "ORTLatentConsistencyModelPipeline",
-        "ORTPipelineForText2Image",
         "ORTPipelineForImage2Image",
         "ORTPipelineForInpainting",
+        "ORTPipelineForText2Image",
         "ORTDiffusionPipeline",
     ]
 
@@ -141,7 +145,11 @@
             raise OptionalDependencyNotAvailable()
     except OptionalDependencyNotAvailable:
         from ..utils.dummy_diffusers_objects import (
+            ORTDiffusionPipeline,
             ORTLatentConsistencyModelPipeline,
+            ORTPipelineForImage2Image,
+            ORTPipelineForInpainting,
+            ORTPipelineForText2Image,
             ORTStableDiffusionImg2ImgPipeline,
             ORTStableDiffusionInpaintPipeline,
             ORTStableDiffusionPipeline,
diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py
index f6914bbcd3a..35d1ffe9fc7 100644
--- a/optimum/utils/dummy_diffusers_objects.py
+++ b/optimum/utils/dummy_diffusers_objects.py
@@ -79,3 +79,47 @@ def __init__(self, *args, **kwargs):
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["diffusers"])
+
+
+class ORTDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["diffusers"])
+
+
+class ORTPipelineForText2Image(metaclass=DummyObject):
+    _backends = ["diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["diffusers"])
+
+
+class ORTPipelineForImage2Image(metaclass=DummyObject):
+    _backends = ["diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["diffusers"])
+
+
+class ORTPipelineForInpainting(metaclass=DummyObject):
+    _backends = ["diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["diffusers"])

From 56d06d467e049c7838b1b6036e2b8c65eb5d7500 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Tue, 10 Sep 2024 11:19:49 +0200
Subject: [PATCH 19/24] remove duplicate code

---
 .../pipeline_stable_diffusion_img2img.py      | 27 -------------------
 1 file changed, 27 deletions(-)

diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
index f7f0586ac90..a66035a789b 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
@@ -19,7 +19,6 @@
 import PIL.Image
 import torch
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.utils import deprecate
 
 from .pipeline_stable_diffusion import StableDiffusionPipelineMixin
 
@@ -228,31 +227,7 @@ def __call__(
 
         latents_dtype = prompt_embeds.dtype
         image = image.astype(latents_dtype)
-        # encode the init image into latents and scale the latents
-        init_latents = self.vae_encoder(sample=image)[0]
-
         scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215)
-        init_latents = scaling_factor * init_latents
-
-        if isinstance(prompt, str):
-            prompt = [prompt]
-        if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
-            # expand init_latents for batch_size
-            deprecation_message = (
-                f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
-                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
-                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
-                " your script to pass as many initial images as text prompts to suppress this warning."
-            )
-            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
-            additional_image_per_prompt = len(prompt) // init_latents.shape[0]
-            init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0)
-        elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
-            raise ValueError(
-                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
-            )
-        else:
-            init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)
 
         # get the original timestep using init_timestep
         offset = self.scheduler.config.get("steps_offset", 0)
@@ -274,8 +249,6 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
-        latents = init_latents
-
         t_start = max(num_inference_steps - init_timestep + offset, 0)
         timesteps = self.scheduler.timesteps[t_start:].numpy()
 

From 475efdfcca21a34fd43204e5ce3a7d5adc44c17f Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Wed, 11 Sep 2024 13:08:56 +0200
Subject: [PATCH 20/24] support testing without diffusers

---
 optimum/onnxruntime/__init__.py          | 16 +++++
 optimum/utils/dummy_diffusers_objects.py | 44 ++++++++++++
 tests/onnxruntime/test_modeling.py       | 91 ++++++++++++++----------
 3 files changed, 113 insertions(+), 38 deletions(-)

diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py
index 09a48ec955c..a6e3c139797 100644
--- a/optimum/onnxruntime/__init__.py
+++ b/optimum/onnxruntime/__init__.py
@@ -83,6 +83,10 @@
         "ORTPipelineForInpainting",
         "ORTPipelineForText2Image",
         "ORTDiffusionPipeline",
+        "ORTModelTextEncoder",
+        "ORTModelUnet",
+        "ORTModelVaeDecoder",
+        "ORTModelVaeEncoder",
     ]
 else:
     _import_structure["modeling_diffusion"] = [
@@ -96,6 +100,10 @@
         "ORTPipelineForInpainting",
         "ORTPipelineForText2Image",
         "ORTDiffusionPipeline",
+        "ORTModelTextEncoder",
+        "ORTModelUnet",
+        "ORTModelVaeDecoder",
+        "ORTModelVaeEncoder",
     ]
 
 
@@ -147,6 +155,10 @@
         from ..utils.dummy_diffusers_objects import (
             ORTDiffusionPipeline,
             ORTLatentConsistencyModelPipeline,
+            ORTModelTextEncoder,
+            ORTModelUnet,
+            ORTModelVaeDecoder,
+            ORTModelVaeEncoder,
             ORTPipelineForImage2Image,
             ORTPipelineForInpainting,
             ORTPipelineForText2Image,
@@ -160,6 +172,10 @@
         from .modeling_diffusion import (
             ORTDiffusionPipeline,
             ORTLatentConsistencyModelPipeline,
+            ORTModelTextEncoder,
+            ORTModelUnet,
+            ORTModelVaeDecoder,
+            ORTModelVaeEncoder,
             ORTPipelineForImage2Image,
             ORTPipelineForInpainting,
             ORTPipelineForText2Image,
diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py
index 35d1ffe9fc7..f63d3a603c4 100644
--- a/optimum/utils/dummy_diffusers_objects.py
+++ b/optimum/utils/dummy_diffusers_objects.py
@@ -123,3 +123,47 @@ def __init__(self, *args, **kwargs):
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["diffusers"])
+
+
+class ORTModelTextEncoder(metaclass=DummyObject):
+    _backends = ["diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["diffusers"])
+
+
+class ORTModelVaeDecoder(metaclass=DummyObject):
+    _backends = ["diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["diffusers"])
+
+
+class ORTModelVaeEncoder(metaclass=DummyObject):
+    _backends = ["diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["diffusers"])
+
+
+class ORTModelUnet(metaclass=DummyObject):
+    _backends = ["diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["diffusers"])
diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index 4b44acb38ab..d8dd46e4ad2 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -71,6 +71,7 @@
     ONNX_DECODER_WITH_PAST_NAME,
     ONNX_ENCODER_NAME,
     ONNX_WEIGHTS_NAME,
+    ORTDiffusionPipeline,
     ORTModelForAudioClassification,
     ORTModelForAudioFrameClassification,
     ORTModelForAudioXVector,
@@ -89,15 +90,12 @@
     ORTModelForSpeechSeq2Seq,
     ORTModelForTokenClassification,
     ORTModelForVision2Seq,
-    ORTStableDiffusionPipeline,
-)
-from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder
-from optimum.onnxruntime.modeling_diffusion import (
     ORTModelTextEncoder,
     ORTModelUnet,
     ORTModelVaeDecoder,
     ORTModelVaeEncoder,
 )
+from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder
 from optimum.onnxruntime.modeling_ort import ORTModel
 from optimum.pipelines import pipeline
 from optimum.utils import (
@@ -108,7 +106,13 @@
     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
     logging,
 )
-from optimum.utils.testing_utils import grid_parameters, remove_directory, require_hf_token, require_ort_rocm
+from optimum.utils.testing_utils import (
+    grid_parameters,
+    remove_directory,
+    require_diffusers,
+    require_hf_token,
+    require_ort_rocm,
+)
 
 
 logger = logging.get_logger()
@@ -205,12 +209,11 @@ def test_load_seq2seq_model_from_empty_cache(self):
         with self.assertRaises(Exception):
             _ = ORTModelForSeq2SeqLM.from_pretrained(self.TINY_ONNX_SEQ2SEQ_MODEL_ID, local_files_only=True)
 
+    @require_diffusers
     def test_load_stable_diffusion_model_from_cache(self):
-        _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)  # caching
+        _ = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)  # caching
 
-        model = ORTStableDiffusionPipeline.from_pretrained(
-            self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True
-        )
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True)
 
         self.assertIsInstance(model.text_encoder, ORTModelTextEncoder)
         self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder)
@@ -218,6 +221,7 @@ def test_load_stable_diffusion_model_from_cache(self):
         self.assertIsInstance(model.unet, ORTModelUnet)
         self.assertIsInstance(model.config, Dict)
 
+    @require_diffusers
     def test_load_stable_diffusion_model_from_empty_cache(self):
         dirpath = os.path.join(
             default_cache_path, "models--" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID.replace("/", "--")
@@ -225,9 +229,7 @@ def test_load_stable_diffusion_model_from_empty_cache(self):
         remove_directory(dirpath)
 
         with self.assertRaises(Exception):
-            _ = ORTStableDiffusionPipeline.from_pretrained(
-                self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True
-            )
+            _ = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True)
 
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
@@ -300,18 +302,20 @@ def test_load_seq2seq_model_unknown_provider(self):
         with self.assertRaises(ValueError):
             ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="FooExecutionProvider")
 
+    @require_diffusers
     def test_load_stable_diffusion_model_from_hub(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         self.assertIsInstance(model.text_encoder, ORTModelTextEncoder)
         self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder)
         self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder)
         self.assertIsInstance(model.unet, ORTModelUnet)
         self.assertIsInstance(model.config, Dict)
 
+    @require_diffusers
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
     def test_load_stable_diffusion_model_cuda_provider(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(
+        model = ORTDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider"
         )
         self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"])
@@ -321,11 +325,12 @@ def test_load_stable_diffusion_model_cuda_provider(self):
         self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers)
         self.assertEqual(model.device, torch.device("cuda:0"))
 
+    @require_diffusers
     @require_torch_gpu
     @require_ort_rocm
     @pytest.mark.rocm_ep_test
     def test_load_stable_diffusion_model_rocm_provider(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(
+        model = ORTDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="ROCMExecutionProvider"
         )
         self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"])
@@ -335,8 +340,9 @@ def test_load_stable_diffusion_model_rocm_provider(self):
         self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers)
         self.assertEqual(model.device, torch.device("cuda:0"))
 
+    @require_diffusers
     def test_load_stable_diffusion_model_cpu_provider(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(
+        model = ORTDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CPUExecutionProvider"
         )
         self.assertListEqual(model.providers, ["CPUExecutionProvider"])
@@ -346,9 +352,10 @@ def test_load_stable_diffusion_model_cpu_provider(self):
         self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers)
         self.assertEqual(model.device, torch.device("cpu"))
 
+    @require_diffusers
     def test_load_stable_diffusion_model_unknown_provider(self):
         with self.assertRaises(ValueError):
-            ORTStableDiffusionPipeline.from_pretrained(
+            ORTDiffusionPipeline.from_pretrained(
                 self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="FooExecutionProvider"
             )
 
@@ -478,12 +485,11 @@ def test_passing_session_options_seq2seq(self):
         self.assertEqual(model.encoder.session.get_session_options().intra_op_num_threads, 3)
         self.assertEqual(model.decoder.session.get_session_options().intra_op_num_threads, 3)
 
+    @require_diffusers
     def test_passing_session_options_stable_diffusion(self):
         options = onnxruntime.SessionOptions()
         options.intra_op_num_threads = 3
-        model = ORTStableDiffusionPipeline.from_pretrained(
-            self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options
-        )
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options)
         self.assertEqual(model.unet.session.get_session_options().intra_op_num_threads, 3)
         self.assertEqual(model.text_encoder.session.get_session_options().intra_op_num_threads, 3)
         self.assertEqual(model.vae_decoder.session.get_session_options().intra_op_num_threads, 3)
@@ -772,10 +778,11 @@ def test_seq2seq_model_on_rocm_ep_str(self):
         self.assertEqual(model.decoder_with_past.session.get_providers()[0], "ROCMExecutionProvider")
         self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"])
 
+    @require_diffusers
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
     def test_passing_provider_options_stable_diffusion(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(
+        model = ORTDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider"
         )
         self.assertEqual(
@@ -791,7 +798,7 @@ def test_passing_provider_options_stable_diffusion(self):
         self.assertEqual(
             model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "1"
         )
-        model = ORTStableDiffusionPipeline.from_pretrained(
+        model = ORTDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID,
             provider="CUDAExecutionProvider",
             provider_options={"do_copy_in_default_stream": 0},
@@ -810,8 +817,9 @@ def test_passing_provider_options_stable_diffusion(self):
             model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "0"
         )
 
+    @require_diffusers
     def test_stable_diffusion_model_on_cpu(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         cpu = torch.device("cpu")
         model.to(cpu)
         self.assertEqual(model.device, cpu)
@@ -825,9 +833,9 @@ def test_stable_diffusion_model_on_cpu(self):
         self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider")
         self.assertListEqual(model.providers, ["CPUExecutionProvider"])
 
-    # test string device input for to()
+    @require_diffusers
     def test_stable_diffusion_model_on_cpu_str(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         cpu = torch.device("cpu")
         model.to("cpu")
         self.assertEqual(model.device, cpu)
@@ -841,10 +849,11 @@ def test_stable_diffusion_model_on_cpu_str(self):
         self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider")
         self.assertListEqual(model.providers, ["CPUExecutionProvider"])
 
+    @require_diffusers
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
     def test_stable_diffusion_model_on_gpu(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         gpu = torch.device("cuda")
         model.to(gpu)
         self.assertEqual(model.device, torch.device("cuda:0"))
@@ -858,11 +867,12 @@ def test_stable_diffusion_model_on_gpu(self):
         self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider")
         self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"])
 
+    @require_diffusers
     @require_torch_gpu
     @require_ort_rocm
     @pytest.mark.rocm_ep_test
     def test_stable_diffusion_model_on_rocm_ep(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         gpu = torch.device("cuda")
         model.to(gpu)
         self.assertEqual(model.device, torch.device("cuda:0"))
@@ -876,34 +886,35 @@ def test_stable_diffusion_model_on_rocm_ep(self):
         self.assertEqual(model.vae_encoder.session.get_providers()[0], "ROCMExecutionProvider")
         self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"])
 
+    @require_diffusers
     @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu")
     def test_stable_diffusion_model_on_gpu_id(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to(torch.device("cuda:1"))
         self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
 
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to(1)
         self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
 
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to("cuda:1")
         self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
 
-    # test string device input for to()
+    @require_diffusers
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
     def test_stable_diffusion_model_on_gpu_str(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to("cuda")
         self.assertEqual(model.device, torch.device("cuda:0"))
         self.assertEqual(model.unet.device, torch.device("cuda:0"))
@@ -916,11 +927,12 @@ def test_stable_diffusion_model_on_gpu_str(self):
         self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider")
         self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"])
 
+    @require_diffusers
     @require_torch_gpu
     @require_ort_rocm
     @pytest.mark.rocm_ep_test
     def test_stable_diffusion_model_on_rocm_ep_str(self):
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to("cuda")
         self.assertEqual(model.device, torch.device("cuda:0"))
         self.assertEqual(model.unet.device, torch.device("cuda:0"))
@@ -975,9 +987,10 @@ def test_save_seq2seq_model_without_past(self):
             self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents)
             self.assertTrue(CONFIG_NAME in folder_contents)
 
+    @require_diffusers
     def test_save_stable_diffusion_model(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
-            model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+            model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
             model.save_pretrained(tmpdirname)
             folder_contents = os.listdir(tmpdirname)
             self.assertIn(model.config_name, folder_contents)
@@ -1050,10 +1063,11 @@ def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool):
             os.environ.pop("FORCE_ONNX_EXTERNAL_DATA")
             remove_directory(tmpdirname)
 
+    @require_diffusers
     def test_save_load_stable_diffusion_model_with_external_data(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1"  # force exporting small model with external data
-            model = ORTStableDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True)
+            model = ORTDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True)
             model.save_pretrained(tmpdirname)
 
             # verify external data is exported
@@ -1068,7 +1082,7 @@ def test_save_load_stable_diffusion_model_with_external_data(self):
                 self.assertIn(ONNX_WEIGHTS_NAME + "_data", folder_contents)
 
             # verify loading from local folder works
-            model = ORTStableDiffusionPipeline.from_pretrained(tmpdirname, export=False)
+            model = ORTDiffusionPipeline.from_pretrained(tmpdirname, export=False)
             os.environ.pop("FORCE_ONNX_EXTERNAL_DATA")
             remove_directory(tmpdirname)
 
@@ -1180,11 +1194,12 @@ def test_push_seq2seq_model_with_external_data_to_hub(self):
             )
             os.environ.pop("FORCE_ONNX_EXTERNAL_DATA")
 
+    @require_diffusers
     @require_hf_token
     def test_push_stable_diffusion_model_with_external_data_to_hub(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1"  # force exporting small model with external data
-            model = ORTStableDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True)
+            model = ORTDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True)
             model.save_pretrained(
                 tmpdirname + "/onnx",
                 token=os.environ.get("HF_AUTH_TOKEN", None),
@@ -1194,7 +1209,7 @@ def test_push_stable_diffusion_model_with_external_data_to_hub(self):
             )
 
             # verify loading from hub works
-            model = ORTStableDiffusionPipeline.from_pretrained(
+            model = ORTDiffusionPipeline.from_pretrained(
                 MODEL_NAMES["stable-diffusion"] + "-onnx",
                 export=False,
                 token=os.environ.get("HF_AUTH_TOKEN", None),

From e2ad89a8ca72a1a77a960b0092728553fced5ab1 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Wed, 11 Sep 2024 13:11:41 +0200
Subject: [PATCH 21/24] remove unnecessary

---
 optimum/utils/testing_utils.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py
index 6579e230dc8..76fe9a05b13 100644
--- a/optimum/utils/testing_utils.py
+++ b/optimum/utils/testing_utils.py
@@ -84,17 +84,6 @@ def require_ort_rocm(test_case):
     )
 
 
-def require_ort_cuda(test_case):
-    """Decorator marking a test that requires CUDAExecutionProvider for ONNX Runtime."""
-    import onnxruntime as ort
-
-    providers = ort.get_available_providers()
-
-    return unittest.skipUnless("CUDAExecutionProvider" == providers[0], "test requires CUDAExecutionProvider")(
-        test_case
-    )
-
-
 def require_hf_token(test_case):
     """
     Decorator marking a test that requires huggingface hub token.

From 7b4b5bdd614694e87830ffa03749b8b0184fb48a Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Wed, 11 Sep 2024 13:53:17 +0200
Subject: [PATCH 22/24] revert

---
 tests/onnxruntime/test_modeling.py | 52 +++++++++++++++---------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index d8dd46e4ad2..edcab8b228d 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -71,7 +71,6 @@
     ONNX_DECODER_WITH_PAST_NAME,
     ONNX_ENCODER_NAME,
     ONNX_WEIGHTS_NAME,
-    ORTDiffusionPipeline,
     ORTModelForAudioClassification,
     ORTModelForAudioFrameClassification,
     ORTModelForAudioXVector,
@@ -94,6 +93,7 @@
     ORTModelUnet,
     ORTModelVaeDecoder,
     ORTModelVaeEncoder,
+    ORTStableDiffusionPipeline,
 )
 from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder
 from optimum.onnxruntime.modeling_ort import ORTModel
@@ -211,9 +211,9 @@ def test_load_seq2seq_model_from_empty_cache(self):
 
     @require_diffusers
     def test_load_stable_diffusion_model_from_cache(self):
-        _ = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)  # caching
+        _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)  # caching
 
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True)
 
         self.assertIsInstance(model.text_encoder, ORTModelTextEncoder)
         self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder)
@@ -229,7 +229,7 @@ def test_load_stable_diffusion_model_from_empty_cache(self):
         remove_directory(dirpath)
 
         with self.assertRaises(Exception):
-            _ = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True)
+            _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True)
 
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
@@ -304,7 +304,7 @@ def test_load_seq2seq_model_unknown_provider(self):
 
     @require_diffusers
     def test_load_stable_diffusion_model_from_hub(self):
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         self.assertIsInstance(model.text_encoder, ORTModelTextEncoder)
         self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder)
         self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder)
@@ -315,7 +315,7 @@ def test_load_stable_diffusion_model_from_hub(self):
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
     def test_load_stable_diffusion_model_cuda_provider(self):
-        model = ORTDiffusionPipeline.from_pretrained(
+        model = ORTStableDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider"
         )
         self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"])
@@ -330,7 +330,7 @@ def test_load_stable_diffusion_model_cuda_provider(self):
     @require_ort_rocm
     @pytest.mark.rocm_ep_test
     def test_load_stable_diffusion_model_rocm_provider(self):
-        model = ORTDiffusionPipeline.from_pretrained(
+        model = ORTStableDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="ROCMExecutionProvider"
         )
         self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"])
@@ -342,7 +342,7 @@ def test_load_stable_diffusion_model_rocm_provider(self):
 
     @require_diffusers
     def test_load_stable_diffusion_model_cpu_provider(self):
-        model = ORTDiffusionPipeline.from_pretrained(
+        model = ORTStableDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CPUExecutionProvider"
         )
         self.assertListEqual(model.providers, ["CPUExecutionProvider"])
@@ -355,7 +355,7 @@ def test_load_stable_diffusion_model_cpu_provider(self):
     @require_diffusers
     def test_load_stable_diffusion_model_unknown_provider(self):
         with self.assertRaises(ValueError):
-            ORTDiffusionPipeline.from_pretrained(
+            ORTStableDiffusionPipeline.from_pretrained(
                 self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="FooExecutionProvider"
             )
 
@@ -489,7 +489,7 @@ def test_passing_session_options_seq2seq(self):
     def test_passing_session_options_stable_diffusion(self):
         options = onnxruntime.SessionOptions()
         options.intra_op_num_threads = 3
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options)
         self.assertEqual(model.unet.session.get_session_options().intra_op_num_threads, 3)
         self.assertEqual(model.text_encoder.session.get_session_options().intra_op_num_threads, 3)
         self.assertEqual(model.vae_decoder.session.get_session_options().intra_op_num_threads, 3)
@@ -782,7 +782,7 @@ def test_seq2seq_model_on_rocm_ep_str(self):
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
     def test_passing_provider_options_stable_diffusion(self):
-        model = ORTDiffusionPipeline.from_pretrained(
+        model = ORTStableDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider"
         )
         self.assertEqual(
@@ -798,7 +798,7 @@ def test_passing_provider_options_stable_diffusion(self):
         self.assertEqual(
             model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "1"
         )
-        model = ORTDiffusionPipeline.from_pretrained(
+        model = ORTStableDiffusionPipeline.from_pretrained(
             self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID,
             provider="CUDAExecutionProvider",
             provider_options={"do_copy_in_default_stream": 0},
@@ -819,7 +819,7 @@ def test_passing_provider_options_stable_diffusion(self):
 
     @require_diffusers
     def test_stable_diffusion_model_on_cpu(self):
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         cpu = torch.device("cpu")
         model.to(cpu)
         self.assertEqual(model.device, cpu)
@@ -835,7 +835,7 @@ def test_stable_diffusion_model_on_cpu(self):
 
     @require_diffusers
     def test_stable_diffusion_model_on_cpu_str(self):
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         cpu = torch.device("cpu")
         model.to("cpu")
         self.assertEqual(model.device, cpu)
@@ -853,7 +853,7 @@ def test_stable_diffusion_model_on_cpu_str(self):
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
     def test_stable_diffusion_model_on_gpu(self):
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         gpu = torch.device("cuda")
         model.to(gpu)
         self.assertEqual(model.device, torch.device("cuda:0"))
@@ -872,7 +872,7 @@ def test_stable_diffusion_model_on_gpu(self):
     @require_ort_rocm
     @pytest.mark.rocm_ep_test
     def test_stable_diffusion_model_on_rocm_ep(self):
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         gpu = torch.device("cuda")
         model.to(gpu)
         self.assertEqual(model.device, torch.device("cuda:0"))
@@ -889,21 +889,21 @@ def test_stable_diffusion_model_on_rocm_ep(self):
     @require_diffusers
     @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu")
     def test_stable_diffusion_model_on_gpu_id(self):
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to(torch.device("cuda:1"))
         self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
 
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to(1)
         self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
 
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to("cuda:1")
         self.assertEqual(model.unet.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
         self.assertEqual(model.text_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
@@ -914,7 +914,7 @@ def test_stable_diffusion_model_on_gpu_id(self):
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
     def test_stable_diffusion_model_on_gpu_str(self):
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to("cuda")
         self.assertEqual(model.device, torch.device("cuda:0"))
         self.assertEqual(model.unet.device, torch.device("cuda:0"))
@@ -932,7 +932,7 @@ def test_stable_diffusion_model_on_gpu_str(self):
     @require_ort_rocm
     @pytest.mark.rocm_ep_test
     def test_stable_diffusion_model_on_rocm_ep_str(self):
-        model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
         model.to("cuda")
         self.assertEqual(model.device, torch.device("cuda:0"))
         self.assertEqual(model.unet.device, torch.device("cuda:0"))
@@ -990,7 +990,7 @@ def test_save_seq2seq_model_without_past(self):
     @require_diffusers
     def test_save_stable_diffusion_model(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
-            model = ORTDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
+            model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)
             model.save_pretrained(tmpdirname)
             folder_contents = os.listdir(tmpdirname)
             self.assertIn(model.config_name, folder_contents)
@@ -1067,7 +1067,7 @@ def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool):
     def test_save_load_stable_diffusion_model_with_external_data(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1"  # force exporting small model with external data
-            model = ORTDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True)
+            model = ORTStableDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True)
             model.save_pretrained(tmpdirname)
 
             # verify external data is exported
@@ -1082,7 +1082,7 @@ def test_save_load_stable_diffusion_model_with_external_data(self):
                 self.assertIn(ONNX_WEIGHTS_NAME + "_data", folder_contents)
 
             # verify loading from local folder works
-            model = ORTDiffusionPipeline.from_pretrained(tmpdirname, export=False)
+            model = ORTStableDiffusionPipeline.from_pretrained(tmpdirname, export=False)
             os.environ.pop("FORCE_ONNX_EXTERNAL_DATA")
             remove_directory(tmpdirname)
 
@@ -1199,7 +1199,7 @@ def test_push_seq2seq_model_with_external_data_to_hub(self):
     def test_push_stable_diffusion_model_with_external_data_to_hub(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1"  # force exporting small model with external data
-            model = ORTDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True)
+            model = ORTStableDiffusionPipeline.from_pretrained(MODEL_NAMES["stable-diffusion"], export=True)
             model.save_pretrained(
                 tmpdirname + "/onnx",
                 token=os.environ.get("HF_AUTH_TOKEN", None),
@@ -1209,7 +1209,7 @@ def test_push_stable_diffusion_model_with_external_data_to_hub(self):
             )
 
             # verify loading from hub works
-            model = ORTDiffusionPipeline.from_pretrained(
+            model = ORTStableDiffusionPipeline.from_pretrained(
                 MODEL_NAMES["stable-diffusion"] + "-onnx",
                 export=False,
                 token=os.environ.get("HF_AUTH_TOKEN", None),

From 036dc46b09b43a1c189e234768b79cdbdb54c7a0 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Thu, 12 Sep 2024 12:32:57 +0200
Subject: [PATCH 23/24] style

---
 tests/onnxruntime/test_modeling.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index edcab8b228d..af3d47f29d3 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -213,7 +213,9 @@ def test_load_seq2seq_model_from_empty_cache(self):
     def test_load_stable_diffusion_model_from_cache(self):
         _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID)  # caching
 
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True)
+        model = ORTStableDiffusionPipeline.from_pretrained(
+            self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True
+        )
 
         self.assertIsInstance(model.text_encoder, ORTModelTextEncoder)
         self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder)
@@ -229,7 +231,9 @@ def test_load_stable_diffusion_model_from_empty_cache(self):
         remove_directory(dirpath)
 
         with self.assertRaises(Exception):
-            _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True)
+            _ = ORTStableDiffusionPipeline.from_pretrained(
+                self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True
+            )
 
     @require_torch_gpu
     @pytest.mark.cuda_ep_test
@@ -489,7 +493,9 @@ def test_passing_session_options_seq2seq(self):
     def test_passing_session_options_stable_diffusion(self):
         options = onnxruntime.SessionOptions()
         options.intra_op_num_threads = 3
-        model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options)
+        model = ORTStableDiffusionPipeline.from_pretrained(
+            self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, session_options=options
+        )
         self.assertEqual(model.unet.session.get_session_options().intra_op_num_threads, 3)
         self.assertEqual(model.text_encoder.session.get_session_options().intra_op_num_threads, 3)
         self.assertEqual(model.vae_decoder.session.get_session_options().intra_op_num_threads, 3)

From afbb9afc99c556a4dae3cbc2207f1d62e045388b Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Thu, 12 Sep 2024 16:30:10 +0200
Subject: [PATCH 24/24] remove model parts from optimum.onnxruntime

---
 optimum/onnxruntime/__init__.py          | 16 ---------
 optimum/utils/dummy_diffusers_objects.py | 44 ------------------------
 tests/onnxruntime/test_modeling.py       | 16 ++++++---
 3 files changed, 11 insertions(+), 65 deletions(-)

diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py
index a6e3c139797..09a48ec955c 100644
--- a/optimum/onnxruntime/__init__.py
+++ b/optimum/onnxruntime/__init__.py
@@ -83,10 +83,6 @@
         "ORTPipelineForInpainting",
         "ORTPipelineForText2Image",
         "ORTDiffusionPipeline",
-        "ORTModelTextEncoder",
-        "ORTModelUnet",
-        "ORTModelVaeDecoder",
-        "ORTModelVaeEncoder",
     ]
 else:
     _import_structure["modeling_diffusion"] = [
@@ -100,10 +96,6 @@
         "ORTPipelineForInpainting",
         "ORTPipelineForText2Image",
         "ORTDiffusionPipeline",
-        "ORTModelTextEncoder",
-        "ORTModelUnet",
-        "ORTModelVaeDecoder",
-        "ORTModelVaeEncoder",
     ]
 
 
@@ -155,10 +147,6 @@
         from ..utils.dummy_diffusers_objects import (
             ORTDiffusionPipeline,
             ORTLatentConsistencyModelPipeline,
-            ORTModelTextEncoder,
-            ORTModelUnet,
-            ORTModelVaeDecoder,
-            ORTModelVaeEncoder,
             ORTPipelineForImage2Image,
             ORTPipelineForInpainting,
             ORTPipelineForText2Image,
@@ -172,10 +160,6 @@
         from .modeling_diffusion import (
             ORTDiffusionPipeline,
             ORTLatentConsistencyModelPipeline,
-            ORTModelTextEncoder,
-            ORTModelUnet,
-            ORTModelVaeDecoder,
-            ORTModelVaeEncoder,
             ORTPipelineForImage2Image,
             ORTPipelineForInpainting,
             ORTPipelineForText2Image,
diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py
index f63d3a603c4..35d1ffe9fc7 100644
--- a/optimum/utils/dummy_diffusers_objects.py
+++ b/optimum/utils/dummy_diffusers_objects.py
@@ -123,47 +123,3 @@ def __init__(self, *args, **kwargs):
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["diffusers"])
-
-
-class ORTModelTextEncoder(metaclass=DummyObject):
-    _backends = ["diffusers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["diffusers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["diffusers"])
-
-
-class ORTModelVaeDecoder(metaclass=DummyObject):
-    _backends = ["diffusers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["diffusers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["diffusers"])
-
-
-class ORTModelVaeEncoder(metaclass=DummyObject):
-    _backends = ["diffusers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["diffusers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["diffusers"])
-
-
-class ORTModelUnet(metaclass=DummyObject):
-    _backends = ["diffusers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["diffusers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["diffusers"])
diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index af3d47f29d3..199b96342e7 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -89,11 +89,6 @@
     ORTModelForSpeechSeq2Seq,
     ORTModelForTokenClassification,
     ORTModelForVision2Seq,
-    ORTModelTextEncoder,
-    ORTModelUnet,
-    ORTModelVaeDecoder,
-    ORTModelVaeEncoder,
-    ORTStableDiffusionPipeline,
 )
 from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder
 from optimum.onnxruntime.modeling_ort import ORTModel
@@ -106,6 +101,7 @@
     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
     logging,
 )
+from optimum.utils.import_utils import is_diffusers_available
 from optimum.utils.testing_utils import (
     grid_parameters,
     remove_directory,
@@ -115,6 +111,16 @@
 )
 
 
+if is_diffusers_available():
+    from optimum.onnxruntime.modeling_diffusion import (
+        ORTModelTextEncoder,
+        ORTModelUnet,
+        ORTModelVaeDecoder,
+        ORTModelVaeEncoder,
+        ORTStableDiffusionPipeline,
+    )
+
+
 logger = logging.get_logger()