From f1b708c4e29a392d84d69f820bcc45bfd89cc221 Mon Sep 17 00:00:00 2001 From: Tom Savage Date: Mon, 16 Sep 2024 09:04:31 +0100 Subject: [PATCH 01/50] Fixes detection of CuPy installed with pre-built wheels (#1965) The CuPy library ships a source distribution (`cupy`) as well as several distributions containing pre-built wheels (`cupy-cuda11x`, `cupy-cuda12x`, `cupy-rocm-5-0`, `cupy-rocm-4-3`). Using `_is_package_available` to detect CuPy only works for the source distribution and fails for the pre-built wheel versions. This is because `_is_package_available` always attempts to resolve version information (even when it is not required) and in doing so assumes that the _importable_ package name matches the _installed_ distribution name. While this is usually the case, it does not hold for CuPy and several other libraries. ONNX Runtime, for example, might be installed as `onnxruntime` or `onnxruntime-gpu`, which is why Optimum already uses `importlib.util.find_spec` to work around the same problem there. This commit replicates that solution for CuPy. --- optimum/onnxruntime/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index ad40af92b9d..985980e31b0 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -13,6 +13,7 @@ # limitations under the License. """Utility functions, classes and constants for ONNX Runtime.""" +import importlib import os import re from enum import Enum @@ -31,7 +32,6 @@ import onnxruntime as ort from ..exporters.onnx import OnnxConfig, OnnxConfigWithLoss -from ..utils.import_utils import _is_package_available if TYPE_CHECKING: @@ -91,9 +91,11 @@ def is_onnxruntime_training_available(): def is_cupy_available(): """ - Checks if onnxruntime-training is available. + Checks if CuPy is available. """ - return _is_package_available("cupy") + # Don't use _is_package_available as it doesn't work with CuPy installed + # with `cupy-cuda*` and `cupy-rocm-*` package names (pre-built wheels). 
+ return importlib.util.find_spec("cupy") is not None class ORTConfigManager: From ca36fc4f66577cd4ac2e6cedcc204d830a1f4985 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:08:34 +0200 Subject: [PATCH 02/50] Adding `ORTPipelineForxxx` entrypoints (#1960) * created auto task mappings * added correct auto classes * created auto task mappings * added correct auto classes * added ort/auto diffusion classes * fix ORTPipeline detection * start test refactoring * dynamic dtype * support torch random numbers generator * compact diffusion testing suite * fix * test * test * test * use latent-consistency architecture name instead of lcm * fix * add ort diffusion pipeline tests * added dummy objects * remove duplicate code * support testing without diffusers * remove unnecessary * revert * style * remove model parts from optimum.onnxruntime --- optimum/exporters/tasks.py | 2 +- optimum/modeling_base.py | 9 +- optimum/onnxruntime/__init__.py | 16 + optimum/onnxruntime/base.py | 50 +- optimum/onnxruntime/modeling_diffusion.py | 338 ++++++-- optimum/onnxruntime/modeling_seq2seq.py | 68 -- .../diffusers/pipeline_latent_consistency.py | 6 +- .../diffusers/pipeline_stable_diffusion.py | 16 +- .../pipeline_stable_diffusion_img2img.py | 83 +- .../pipeline_stable_diffusion_inpaint.py | 22 +- .../diffusers/pipeline_stable_diffusion_xl.py | 20 +- .../pipeline_stable_diffusion_xl_img2img.py | 28 +- optimum/pipelines/diffusers/pipeline_utils.py | 8 +- optimum/utils/dummy_diffusers_objects.py | 44 + tests/exporters/exporters_utils.py | 2 +- tests/onnxruntime/test_diffusion.py | 793 ++++++++++++++++++ tests/onnxruntime/test_modeling.py | 47 +- .../test_stable_diffusion_pipeline.py | 562 ------------- 18 files changed, 1287 insertions(+), 827 deletions(-) create mode 100644 tests/onnxruntime/test_diffusion.py delete mode 100644 tests/onnxruntime/test_stable_diffusion_pipeline.py diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 97053040879..a489f34fb06 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -308,9 +308,9 @@ class TasksManager: "image-feature-extraction": "feature-extraction", # for backward compatibility and testing (where # model task and model type are still the same) - "lcm": "text-to-image", "stable-diffusion": "text-to-image", "stable-diffusion-xl": "text-to-image", + "latent-consistency": "text-to-image", } _CUSTOM_CLASSES = { diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 5bab0622de4..3da2d9d0d21 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -85,7 +85,6 @@ class PreTrainedModel(ABC): # noqa: F811 class OptimizedModel(PreTrainedModel): config_class = AutoConfig - load_tf_weights = None base_model_prefix = "optimized_model" config_name = CONFIG_NAME @@ -378,10 +377,14 @@ def from_pretrained( ) model_id, revision = model_id.split("@") - library_name = TasksManager.infer_library_from_model(model_id, subfolder, revision, cache_dir, token=token) + library_name = TasksManager.infer_library_from_model( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if library_name == "timm": - config = PretrainedConfig.from_pretrained(model_id, subfolder, revision) + config = PretrainedConfig.from_pretrained( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if config is None: if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME: 
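To make the failure mode described in PATCH 01 above concrete, here is a minimal sketch, assuming CuPy was installed via `pip install cupy-cuda12x`, of why `importlib.util.find_spec` succeeds where version-based detection fails:

    import importlib.metadata
    import importlib.util

    # With `pip install cupy-cuda12x`, the installed *distribution* is named
    # "cupy-cuda12x" while the *importable* module is still named "cupy".
    try:
        importlib.metadata.version("cupy")  # resolves by distribution name
    except importlib.metadata.PackageNotFoundError:
        print("version lookup fails: no distribution named 'cupy' is installed")

    # find_spec resolves by importable module name, so it detects the source
    # distribution and all pre-built wheel variants alike.
    print(importlib.util.find_spec("cupy") is not None)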
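The entrypoints PATCH 02 adds (exported in the `__init__.py` diff that follows) can be used as in this minimal sketch; it reuses the tiny test checkpoint and prompt from the test suite further below, any diffusers text-to-image repo should behave the same, and `export=True` triggers the usual Optimum ONNX export path:

    from optimum.onnxruntime import ORTDiffusionPipeline, ORTPipelineForText2Image

    model_id = "hf-internal-testing/tiny-stable-diffusion-torch"

    # ORTDiffusionPipeline reads `_class_name` from the repo's model_index.json and
    # dispatches to the matching ORT class (here ORTStableDiffusionPipeline).
    pipeline = ORTDiffusionPipeline.from_pretrained(model_id, export=True)

    # ORTPipelineForText2Image instead maps the model architecture ("stable-diffusion",
    # "stable-diffusion-xl" or "latent-consistency") to its text-to-image pipeline.
    pipeline = ORTPipelineForText2Image.from_pretrained(model_id, export=True)

    image = pipeline("sailing ship in storm by Leonardo da Vinci", num_inference_steps=3).images[0]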
diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index f1d4f63a9ff..09a48ec955c 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -79,6 +79,10 @@ "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + "ORTPipelineForText2Image", + "ORTDiffusionPipeline", ] else: _import_structure["modeling_diffusion"] = [ @@ -88,6 +92,10 @@ "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + "ORTPipelineForText2Image", + "ORTDiffusionPipeline", ] @@ -137,7 +145,11 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( + ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, @@ -146,7 +158,11 @@ ) else: from .modeling_diffusion import ( + ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index d9877670ba8..0e54bafed78 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -41,17 +41,11 @@ class ORTModelPart: _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs - def __init__( - self, - session: InferenceSession, - parent_model: "ORTModel", - ): + def __init__(self, session: InferenceSession, parent_model: "ORTModel"): self.session = session self.parent_model = parent_model - self.normalized_config = NormalizedConfigManager.get_normalized_config_class( - self.parent_model.config.model_type - )(self.parent_model.config) self.main_input_name = self.parent_model.main_input_name + self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} @@ -90,12 +84,18 @@ class ORTEncoder(ORTModelPart): Encoder part of the encoder-decoder model for ONNX Runtime inference. 
""" - def forward( - self, - input_ids: torch.LongTensor, - attention_mask: torch.LongTensor, - **kwargs, - ) -> BaseModelOutput: + def __init__(self, session: InferenceSession, parent_model: "ORTModel"): + super().__init__(session, parent_model) + + config = ( + self.parent_model.config.encoder + if hasattr(self.parent_model.config, "encoder") + else self.parent_model.config + ) + + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + + def forward(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, **kwargs) -> BaseModelOutput: use_torch = isinstance(input_ids, torch.Tensor) self.parent_model.raise_on_numpy_input_io_binding(use_torch) @@ -138,6 +138,14 @@ def __init__( ): super().__init__(session, parent_model) + config = ( + self.parent_model.config.decoder + if hasattr(self.parent_model.config, "decoder") + else self.parent_model.config + ) + + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + # TODO: make this less hacky. self.key_value_input_names = [key for key in self.input_names if (".key" in key) or (".value" in key)] self.key_value_output_names = [key for key in self.output_names if (".key" in key) or (".value" in key)] @@ -153,11 +161,7 @@ def __init__( self.use_past_in_outputs = len(self.key_value_output_names) > 0 self.use_past_in_inputs = len(self.key_value_input_names) > 0 - self.use_fp16 = False - for inp in session.get_inputs(): - if "past_key_values" in inp.name and inp.type == "tensor(float16)": - self.use_fp16 = True - break + self.use_fp16 = self.dtype == torch.float16 # We may use ORTDecoderForSeq2Seq for vision-encoder-decoder models, where models as gpt2 # can be used but do not support KV caching for the cross-attention key/values, see: @@ -461,11 +465,3 @@ def prepare_inputs_for_merged( cache_position = cache_position.to(self.device) return use_cache_branch_tensor, past_key_values, cache_position - - -class ORTDecoder(ORTDecoderForSeq2Seq): - def __init__(self, *args, **kwargs): - logger.warning( - "The class `ORTDecoder` is deprecated and will be removed in optimum v1.15.0, please use `ORTDecoderForSeq2Seq` instead." 
- ) - super().__init__(*args, **kwargs) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 4bbfb2eda2a..18cd38c5f29 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -17,7 +17,7 @@ import os import shutil import warnings -from abc import abstractmethod +from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Dict, Optional, Union @@ -25,18 +25,28 @@ import numpy as np import torch from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + ConfigMixin, DDIMScheduler, + LatentConsistencyModelPipeline, LMSDiscreteScheduler, PNDMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLPipeline, ) from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available from huggingface_hub import snapshot_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import validate_hf_hub_args from transformers import CLIPFeatureExtractor, CLIPTokenizer from transformers.file_utils import add_end_docstrings +from transformers.modeling_outputs import ModelOutput import onnxruntime as ort @@ -56,9 +66,10 @@ DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) +from .base import ORTModelPart +from .io_binding import TypeHelper from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel from .utils import ( - _ORT_TO_NP_TYPE, ONNX_WEIGHTS_NAME, get_provider_for_device, parse_device, @@ -69,23 +80,23 @@ logger = logging.getLogger(__name__) -class ORTStableDiffusionPipelineBase(ORTModel): - auto_model_class = StableDiffusionPipeline - main_input_name = "input_ids" - base_model_prefix = "onnx_model" +class ORTPipeline(ORTModel): + auto_model_class = None + model_type = "onnx_pipeline" + config_name = "model_index.json" sub_component_config_name = "config.json" def __init__( self, vae_decoder_session: ort.InferenceSession, - text_encoder_session: ort.InferenceSession, unet_session: ort.InferenceSession, - config: Dict[str, Any], tokenizer: CLIPTokenizer, + config: Dict[str, Any], scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], feature_extractor: Optional[CLIPFeatureExtractor] = None, vae_encoder_session: Optional[ort.InferenceSession] = None, + text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, tokenizer_2: Optional[CLIPTokenizer] = None, use_io_binding: Optional[bool] = None, @@ -94,23 +105,28 @@ def __init__( """ Args: vae_decoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the VAE decoder. - text_encoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the text encoder. + The ONNX Runtime inference session associated to the VAE decoder unet_session (`ort.InferenceSession`): The ONNX Runtime inference session associated to the U-NET. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + for the text encoder. config (`Dict[str, Any]`): A config dictionary from which the model components will be instantiated. 
Make sure to only load configuration files of compatible classes. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`): A scheduler to be used in combination with the U-NET component to denoise the encoded image latents. feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`): A model extracting features from generated images to be used as inputs for the `safety_checker` vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): The ONNX Runtime inference session associated to the VAE encoder. + text_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): + The ONNX Runtime inference session associated to the text encoder. + tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + for the second text encoder. use_io_binding (`Optional[bool]`, defaults to `None`): Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to `True` if the device is CUDA, otherwise defaults to `False`. @@ -118,7 +134,7 @@ def __init__( The directory under which the model exported to ONNX was saved. """ self.shared_attributes_init( - vae_decoder_session, + model=vae_decoder_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, ) @@ -350,9 +366,9 @@ def _from_pretrained( text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_2_path=new_model_save_dir - / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER - / text_encoder_2_file_name, + text_encoder_2_path=( + new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name + ), provider=provider, session_options=session_options, provider_options=provider_options, @@ -399,7 +415,7 @@ def _from_transformers( provider_options: Optional[Dict[str, Any]] = None, use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTStableDiffusionPipeline": + ) -> "ORTPipeline": if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", @@ -480,131 +496,142 @@ def _save_config(self, save_directory): self.save_config(save_directory) -# TODO : Use ORTModelPart once IOBinding support is added -class _ORTDiffusionModelPart: - """ - For multi-file ONNX models, represents a part of the model. - It has its own `onnxruntime.InferenceSession`, and can perform a forward pass. 
- """ - +class ORTPipelinePart(ORTModelPart): CONFIG_NAME = "config.json" - def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): - self.session = session - self.parent_model = parent_model - self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} - self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): config_path = Path(session._model_path).parent / self.CONFIG_NAME - self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - self.input_dtype = {inputs.name: _ORT_TO_NP_TYPE[inputs.type] for inputs in self.session.get_inputs()} + + if config_path.is_file(): + # TODO: use FrozenDict + self.config = parent_model._dict_from_json_file(config_path) + else: + self.config = {} + + super().__init__(session, parent_model) @property - def device(self): - return self.parent_model.device + def input_dtype(self): + # for backward compatibility and diffusion mixins (will be standardized in the future) + return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} - @abstractmethod - def forward(self, *args, **kwargs): - pass - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) +class ORTModelTextEncoder(ORTPipelinePart): + def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(input_ids, torch.Tensor) + model_inputs = {"input_ids": input_ids} -class ORTModelTextEncoder(_ORTDiffusionModelPart): - def forward(self, input_ids: np.ndarray): - onnx_inputs = { - "input_ids": input_ids, - } - outputs = self.session.run(None, onnx_inputs) - return outputs + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + return ModelOutput(**model_outputs) -class ORTModelUnet(_ORTDiffusionModelPart): - def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): - super().__init__(session, parent_model) +class ORTModelUnet(ORTPipelinePart): def forward( self, - sample: np.ndarray, - timestep: np.ndarray, - encoder_hidden_states: np.ndarray, - text_embeds: Optional[np.ndarray] = None, - time_ids: Optional[np.ndarray] = None, - timestep_cond: Optional[np.ndarray] = None, + sample: Union[np.ndarray, torch.Tensor], + timestep: Union[np.ndarray, torch.Tensor], + encoder_hidden_states: Union[np.ndarray, torch.Tensor], + text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, + time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, ): - onnx_inputs = { + use_torch = isinstance(sample, torch.Tensor) + + model_inputs = { "sample": sample, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, + "text_embeds": text_embeds, + "time_ids": time_ids, + "timestep_cond": timestep_cond, } - if text_embeds is not None: - onnx_inputs["text_embeds"] = text_embeds - if time_ids is not None: - onnx_inputs["time_ids"] = time_ids - if timestep_cond is not None: - onnx_inputs["timestep_cond"] = timestep_cond - outputs = self.session.run(None, onnx_inputs) - return outputs + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + return 
ModelOutput(**model_outputs) -class ORTModelVaeDecoder(_ORTDiffusionModelPart): - def forward(self, latent_sample: np.ndarray): - onnx_inputs = { - "latent_sample": latent_sample, - } - outputs = self.session.run(None, onnx_inputs) - return outputs +class ORTModelVaeDecoder(ORTPipelinePart): + def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(latent_sample, torch.Tensor) -class ORTModelVaeEncoder(_ORTDiffusionModelPart): - def forward(self, sample: np.ndarray): - onnx_inputs = { - "sample": sample, - } - outputs = self.session.run(None, onnx_inputs) - return outputs + model_inputs = {"latent_sample": latent_sample} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) + + +class ORTModelVaeEncoder(ORTPipelinePart): + def forward(self, sample: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(sample, torch.Tensor) + + model_inputs = {"sample": sample} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). """ + main_input_name = "prompt" + auto_model_class = StableDiffusionPipeline + __call__ = StableDiffusionPipelineMixin.__call__ @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ + main_input_name = "prompt" + auto_model_class = StableDiffusionImg2ImgPipeline + __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). 
""" + main_input_name = "prompt" + auto_model_class = StableDiffusionInpaintPipeline + __call__ = StableDiffusionInpaintPipelineMixin.__call__ @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentConsistencyPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ - __call__ = LatentConsistencyPipelineMixin.__call__ + main_input_name = "prompt" + auto_model_class = LatentConsistencyModelPipeline + __call__ = LatentConsistencyPipelineMixin.__call__ -class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase): - auto_model_class = StableDiffusionXLImg2ImgPipeline +class ORTStableDiffusionXLPipelineBase(ORTPipeline): def __init__( self, vae_decoder_session: ort.InferenceSession, @@ -657,6 +684,9 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). """ + main_input_name = "prompt" + auto_model_class = StableDiffusionXLPipeline + __call__ = StableDiffusionXLPipelineMixin.__call__ @@ -666,4 +696,140 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
""" + main_input_name = "prompt" + auto_model_class = StableDiffusionXLImg2ImgPipeline + __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + + +SUPPORTED_ORT_PIPELINES = [ + ORTStableDiffusionPipeline, + ORTStableDiffusionImg2ImgPipeline, + ORTStableDiffusionInpaintPipeline, + ORTLatentConsistencyModelPipeline, + ORTStableDiffusionXLPipeline, + ORTStableDiffusionXLImg2ImgPipeline, +] + + +def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): + for ort_pipeline_class in SUPPORTED_ORT_PIPELINES: + if ( + ort_pipeline_class.__name__ == pipeline_class_name + or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): + return ort_pipeline_class + + if throw_error_if_not_exist: + raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}") + + +class ORTDiffusionPipeline(ConfigMixin): + config_name = "model_index.json" + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + load_config_kwargs = { + "force_download": kwargs.get("force_download", False), + "resume_download": kwargs.get("resume_download", None), + "local_files_only": kwargs.get("local_files_only", False), + "cache_dir": kwargs.get("cache_dir", None), + "revision": kwargs.get("revision", None), + "proxies": kwargs.get("proxies", None), + "token": kwargs.get("token", None), + } + + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + config = config[0] if isinstance(config, tuple) else config + class_name = config["_class_name"] + + ort_pipeline_class = _get_pipeline_class(class_name) + + return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) + + +ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), + ("latent-consistency", ORTLatentConsistencyModelPipeline), + ] +) + +ORT_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), + ] +) + +ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ] +) + +SUPPORTED_ORT_PIPELINES_MAPPINGS = [ + ORT_TEXT2IMAGE_PIPELINES_MAPPING, + ORT_IMAGE2IMAGE_PIPELINES_MAPPING, + ORT_INPAINT_PIPELINES_MAPPING, +] + + +def _get_task_class(mapping, pipeline_class_name): + def _get_model_name(pipeline_class_name): + for ort_pipelines_mapping in SUPPORTED_ORT_PIPELINES_MAPPINGS: + for model_name, ort_pipeline_class in ort_pipelines_mapping.items(): + if ( + ort_pipeline_class.__name__ == pipeline_class_name + or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): + return model_name + + model_name = _get_model_name(pipeline_class_name) + + if model_name is not None: + task_class = mapping.get(model_name, None) + if task_class is not None: + return task_class + + raise ValueError(f"ORTPipelineForTask can't find a pipeline linked to {pipeline_class_name} for {model_name}") + + +class ORTPipelineForTask(ConfigMixin): + config_name = "model_index.json" + + @classmethod + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + load_config_kwargs = { + "force_download": kwargs.get("force_download", False), + "resume_download": kwargs.get("resume_download", None), + "local_files_only": kwargs.get("local_files_only", False), + "cache_dir": kwargs.get("cache_dir", None), + "revision": kwargs.get("revision", None), + "proxies": 
kwargs.get("proxies", None), + "token": kwargs.get("token", None), + } + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + config = config[0] if isinstance(config, tuple) else config + class_name = config["_class_name"] + + ort_pipeline_class = _get_task_class(cls.ort_pipelines_mapping, class_name) + + return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) + + +class ORTPipelineForText2Image(ORTPipelineForTask): + auto_model_class = AutoPipelineForText2Image + ort_pipelines_mapping = ORT_TEXT2IMAGE_PIPELINES_MAPPING + + +class ORTPipelineForImage2Image(ORTPipelineForTask): + auto_model_class = AutoPipelineForImage2Image + ort_pipelines_mapping = ORT_IMAGE2IMAGE_PIPELINES_MAPPING + + +class ORTPipelineForInpainting(ORTPipelineForTask): + auto_model_class = AutoPipelineForInpainting + ort_pipelines_mapping = ORT_INPAINT_PIPELINES_MAPPING diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 4ce3e4707ed..3cecadafe3e 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -46,7 +46,6 @@ from ..onnx.utils import _get_external_data_paths from ..utils import check_if_transformers_greater from ..utils.file_utils import validate_file_exists -from ..utils.normalized_config import NormalizedConfigManager from ..utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors from .base import ORTDecoderForSeq2Seq, ORTEncoder from .constants import ( @@ -72,16 +71,6 @@ from transformers.generation_utils import GenerationMixin -# if check_if_transformers_greater("4.37.0"): -# # starting from transformers v4.37.0, the whisper generation loop is implemented in the `WhisperGenerationMixin` -# # and it implements many new features including short and long form generation, and starts with 2 init tokens -# from transformers.models.whisper.generation_whisper import WhisperGenerationMixin -# else: - -# class WhisperGenerationMixin(WhisperForConditionalGeneration, GenerationMixin): -# pass - - if check_if_transformers_greater("4.43.0"): from transformers.cache_utils import EncoderDecoderCache else: @@ -1165,49 +1154,6 @@ class ORTModelForSeq2SeqLM(ORTModelForConditionalGeneration, GenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" - def __init__( - self, - encoder_session: ort.InferenceSession, - decoder_session: ort.InferenceSession, - config: "PretrainedConfig", - onnx_paths: List[str], - decoder_with_past_session: Optional[ort.InferenceSession] = None, - use_cache: bool = True, - use_io_binding: Optional[bool] = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - preprocessors: Optional[List] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ): - super().__init__( - encoder_session, - decoder_session, - config, - onnx_paths, - decoder_with_past_session, - use_cache, - use_io_binding, - model_save_dir, - preprocessors, - generation_config, - **kwargs, - ) - - # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized. 
- if config.model_type == "encoder-decoder": - self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.encoder.model_type - )(config.encoder) - - self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - - if self.decoder_with_past is not None: - self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoder(session, self) @@ -1521,20 +1467,6 @@ def __init__( **kwargs, ) - # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized. - self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.encoder.model_type - )(config.encoder) - - self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - - if self.decoder_with_past is not None: - self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoderForVisionEncoderDecoder(session, self) diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py index 41c85b5b6ac..630d463de73 100644 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ b/optimum/pipelines/diffusers/pipeline_latent_consistency.py @@ -36,7 +36,7 @@ def __call__( original_inference_steps: int = None, guidance_scale: float = 8.5, num_images_per_prompt: int = 1, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, output_type: str = "pil", @@ -66,7 +66,7 @@ def __call__( usually at the expense of lower image quality. num_images_per_prompt (`int`, defaults to 1): The number of images to generate per prompt. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): A np.random.RandomState to make generation deterministic. 
latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -121,7 +121,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() prompt_embeds = self._encode_prompt( prompt, diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index 98bff0de44d..6cc47fab1b9 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -189,7 +189,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype ) if latents is None: - latents = generator.randn(*shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*shape).astype(dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) elif latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") @@ -209,7 +217,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -248,7 +256,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: A np.random.RandomState to make generation deterministic. latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -303,7 +311,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py index 81a6ffa1e04..a66035a789b 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py @@ -16,10 +16,9 @@ from typing import Callable, List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import deprecate from .pipeline_stable_diffusion import StableDiffusionPipelineMixin @@ -72,6 +71,43 @@ def check_inputs( f" {negative_prompt_embeds.shape}." 
) + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + else: + init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = np.concatenate([init_latents], axis=0) + + # add noise to latents using the timesteps + if isinstance(generator, np.random.RandomState): + noise = generator.randn(*init_latents.shape).astype(dtype) + elif isinstance(generator, torch.Generator): + noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) + + init_latents = self.scheduler.add_noise( + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) + ).numpy() + + return init_latents + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__ def __call__( self, @@ -83,7 +119,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, output_type: str = "pil", @@ -125,7 +161,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): A np.random.RandomState to make generation deterministic. prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not @@ -168,7 +204,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -191,31 +227,7 @@ def __call__( latents_dtype = prompt_embeds.dtype image = image.astype(latents_dtype) - # encode the init image into latents and scale the latents - init_latents = self.vae_encoder(sample=image)[0] - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - init_latents = scaling_factor * init_latents - - if isinstance(prompt, str): - prompt = [prompt] - if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = len(prompt) // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) - elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts." - ) - else: - init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0) # get the original timestep using init_timestep offset = self.scheduler.config.get("steps_offset", 0) @@ -225,12 +237,8 @@ def __call__( timesteps = self.scheduler.timesteps.numpy()[-init_timestep] timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) - # add noise to latents using the timesteps - noise = generator.randn(*init_latents.shape).astype(latents_dtype) - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ) - init_latents = init_latents.numpy() + # 5. Prepare latent variables + latents = self.prepare_latents(image, timesteps, batch_size, num_images_per_prompt, latents_dtype, generator) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
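The pipelines touched by this patch all standardize the same dual-generator handling shown in the hunks above and below; a minimal self-contained sketch of that pattern (`make_noise` is a hypothetical helper name, not part of the patch):

    import numpy as np
    import torch

    def make_noise(shape, dtype, generator=None):
        # Accept either a NumPy RandomState or a torch.Generator, as the
        # pipelines now do, and fall back to a fresh RandomState.
        if generator is None:
            generator = np.random.RandomState()
        if isinstance(generator, np.random.RandomState):
            return generator.randn(*shape).astype(dtype)
        if isinstance(generator, torch.Generator):
            return torch.randn(*shape, generator=generator).numpy().astype(dtype)
        raise ValueError(
            f"Expected `generator` to be of type `np.random.RandomState` or"
            f" `torch.Generator`, but got {type(generator)}."
        )

    # Either seeding style now yields run-to-run reproducible latents:
    np_noise = make_noise((1, 4, 16, 16), np.float32, np.random.RandomState(42))
    pt_noise = make_noise((1, 4, 16, 16), np.float32, torch.Generator().manual_seed(42))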
@@ -241,8 +249,6 @@ def __call__( if accepts_eta: extra_step_kwargs["eta"] = eta - latents = init_latents - t_start = max(num_inference_steps - init_timestep + offset, 0) timesteps = self.scheduler.timesteps[t_start:].numpy() @@ -276,7 +282,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py index 19de793ccd0..cb3c7db96e9 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py @@ -16,7 +16,7 @@ from typing import Callable, List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.utils import PIL_INTERPOLATION @@ -108,7 +108,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -200,7 +200,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -229,11 +229,19 @@ def __call__( width // self.vae_scale_factor, ) latents_dtype = prompt_embeds.dtype + if latents is None: - latents = generator.randn(*latents_shape).astype(latents_dtype) - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*latents_shape).astype(latents_dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*latents_shape, generator=generator).numpy().astype(latents_dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." 
+ ) + elif latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # prepare mask and masked_image mask, masked_image = prepare_mask_and_masked_image( diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py index 2a5e7bf78b0..0407c16a77a 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -235,7 +235,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype ) if latents is None: - latents = generator.randn(*shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*shape).astype(dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) elif latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") @@ -270,7 +278,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -315,7 +323,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: A np.random.RandomState to make generation deterministic. latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -383,7 +391,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -440,6 +448,7 @@ def __call__( timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance @@ -475,7 +484,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py index a07903a735e..19988599b64 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py @@ -17,7 +17,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput @@ -222,7 +222,7 @@ def get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): + def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): batch_size = batch_size * num_images_per_prompt if image.shape[1] == 4: @@ -242,11 +242,22 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt init_latents = np.concatenate([init_latents], axis=0) # add noise to latents using the timesteps - noise = generator.randn(*init_latents.shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + noise = generator.randn(*init_latents.shape).astype(dtype) + elif isinstance(generator, torch.Generator): + noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) + init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timestep) + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) ) - return init_latents.numpy() + init_latents = init_latents.numpy() + + return init_latents def _get_add_time_ids( self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype @@ -274,7 +285,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -375,7 +386,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` @@ -482,7 +493,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py index 869b91ffe59..e9d5986b61c 100644 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ b/optimum/pipelines/diffusers/pipeline_utils.py @@ -17,7 +17,7 @@ from typing import List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers import ConfigMixin from diffusers.image_processor import VaeImageProcessor as DiffusersVaeImageProcessor @@ -206,7 +206,7 @@ def postprocess( def get_height_width( self, - image: [PIL.Image.Image, np.ndarray], + image: Union[PIL.Image.Image, np.ndarray], height: Optional[int] = None, width: Optional[int] = None, ): @@ -264,10 +264,10 @@ def reshape(images: np.ndarray) -> np.ndarray: # TODO : remove after diffusers v0.21.0 release def resize( self, - image: [PIL.Image.Image, np.ndarray, torch.Tensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], height: Optional[int] = None, width: Optional[int] = None, - ) -> [PIL.Image.Image, np.ndarray, torch.Tensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Resize image. """ diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index f6914bbcd3a..35d1ffe9fc7 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -79,3 +79,47 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) + + +class ORTDiffusionPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForText2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForImage2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForInpainting(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index a55c7a124df..c8a33b0be35 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -298,7 +298,7 @@ PYTORCH_DIFFUSION_MODEL = { "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", - "lcm": "echarlaix/tiny-random-latent-consistency", + "latent-consistency": "echarlaix/tiny-random-latent-consistency", } PYTORCH_TIMM_MODEL = { diff --git 
a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py new file mode 100644 index 00000000000..9f480b2d1a0 --- /dev/null +++ b/tests/onnxruntime/test_diffusion.py @@ -0,0 +1,793 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import numpy as np +import PIL +import pytest +import torch +from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + DiffusionPipeline, +) +from diffusers.utils import load_image +from parameterized import parameterized +from transformers.testing_utils import require_torch_gpu +from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin + +from optimum.onnxruntime import ( + ORTDiffusionPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, +) +from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor +from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm + + +def get_generator(framework, seed): + if framework == "np": + return np.random.RandomState(seed) + elif framework == "pt": + return torch.Generator().manual_seed(seed) + else: + raise ValueError(f"Unknown framework: {framework}") + + +def _generate_prompts(batch_size=1): + inputs = { + "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + +def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type="pil"): + if input_type == "pil": + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo.png" + ).resize((width, height)) + elif input_type == "np": + image = np.random.rand(height, width, channel) + elif input_type == "pt": + image = torch.rand((channel, height, width)) + + return [image] * batch_size + + +def to_np(image): + if isinstance(image[0], PIL.Image.Image): + return np.stack([np.array(i) for i in image], axis=0) + elif isinstance(image, torch.Tensor): + return image.cpu().numpy().transpose(0, 2, 3, 1) + return image + + +class ORTPipelineForText2ImageTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"] + + ORTMODEL_CLASS = ORTPipelineForText2Image + AUTOMODEL_CLASS = AutoPipelineForText2Image + + TASK = "text-to-image" + + def generate_inputs(self, height=128, width=128, batch_size=1): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["height"] = height + inputs["width"] = width + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn( + f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) + ) + + 
@parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_ort_pipeline_class_dispatch(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        for num_images in [1, 3]:
+            outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+            self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 128, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        if model_arch == "latent-consistency":
+            # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step
+            # TODO: Investigate why this is the case
+            inputs["num_inference_steps"] = 1
+
+        for output_type in ["latent", "np"]:
+            inputs["output_type"] = output_type
+
+            ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+            diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+            self.assertTrue(
+                np.allclose(ort_output, diffusers_output, atol=1e-4),
+                np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4),
+            )
+        self.assertEqual(ort_pipeline.device, diffusers_pipeline.device)
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        height, width, batch_size = 64, 32, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 64, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ort_callback = Callback()
+        auto_callback = Callback()
+
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertTrue(auto_callback.has_been_called)
+        self.assertEqual(auto_callback.number_of_steps, ort_callback.number_of_steps)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+        height, width, batch_size = 128, 64, 1
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        for output_type in ["np", "pil", "latent"]:
+            inputs["output_type"] = output_type
+            outputs = pipeline(**inputs).images
+            if output_type == "pil":
+                self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+            elif output_type == "np":
+                self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+            else:
+                self.assertEqual(
+                    outputs.shape,
+                    (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                )
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        if model_arch in ["latent-consistency"]:
+            pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+ + for generator_framework in ["np", "pt"]: + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_negative_prompt(self, model_arch: str): + if model_arch in ["latent-consistency"]: + pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") + + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + negative_prompt = ["This is a negative prompt"] + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + image_slice_1 = pipeline( + **inputs, negative_prompt=negative_prompt, generator=np.random.RandomState(SEED) + ).images[0, -3:, -3:, -1] + prompt = inputs.pop("prompt") + + if model_arch == "stable-diffusion-xl": + ( + inputs["prompt_embeds"], + inputs["negative_prompt_embeds"], + inputs["pooled_prompt_embeds"], + inputs["negative_pooled_prompt_embeds"], + ) = pipeline._encode_prompt(prompt, 1, False, negative_prompt) + else: + text_ids = pipeline.tokenizer( + prompt, + max_length=pipeline.tokenizer.model_max_length, + padding="max_length", + return_tensors="np", + truncation=True, + ).input_ids + negative_text_ids = pipeline.tokenizer( + negative_prompt, + max_length=pipeline.tokenizer.model_max_length, + padding="max_length", + return_tensors="np", + truncation=True, + ).input_ids + inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0] + inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0] + + image_slice_2 = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1] + + self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1)) + + +class ORTPipelineForImage2ImageTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + + AUTOMODEL_CLASS = AutoPipelineForImage2Image + ORTMODEL_CLASS = ORTPipelineForImage2Image + + TASK = "image-to-image" + + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="np"): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["image"] = _generate_images( + height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type + ) + + inputs["strength"] = 0.75 + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn( + f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) + ) + + @parameterized.expand(list(SUPPORTED_ARCHITECTURES)) + @require_diffusers + def test_ort_pipeline_class_dispatch(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + 
self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        batch_size, height = 1, 32
+        for width in [64, 32]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+            for num_images in [1, 3]:
+                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        if model_arch in ["stable-diffusion"]:
+            pytest.skip(
+                "Stable Diffusion For Img2Img doesn't behave as expected with callbacks (doesn't call it every step with callback_steps=1)"
+            )
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["num_inference_steps"] = 3
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        ort_callback = Callback()
+        auto_callback = Callback()
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        height, width, batch_size = 32, 64, 1
+
+        for input_type in ["np", "pil", "pt"]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
+
+            for output_type in ["np", "pil", "latent"]:
+                inputs["output_type"] = output_type
+                outputs = pipeline(**inputs).images
+                if output_type == "pil":
+                    self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+                elif output_type == "np":
+                    self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+                else:
+                    self.assertEqual(
+                        outputs.shape,
+                        (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                    )
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        pytest.skip("Img2Img models do not support output reproducibility for some reason")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 128, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images
+
+        self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        pytest.skip("Img2Img models do not support output reproducibility for some reason")
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        for generator_framework in ["np", "pt"]:
+            ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1))
+
+            self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0]))
+            self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0]))
+
+
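For orientation, the image-to-image entrypoint exercised above follows the diffusers Auto* API. A minimal usage sketch, assuming a hypothetical ONNX-exported checkpoint id (any img2img-capable export would do; the image URL and call parameters mirror the ones used by this test suite):

    import numpy as np
    from diffusers.utils import load_image
    from optimum.onnxruntime import ORTPipelineForImage2Image

    # "my-org/my-sd-onnx" is a placeholder repo id, not a checkpoint this suite ships.
    pipe = ORTPipelineForImage2Image.from_pretrained("my-org/my-sd-onnx")
    init_image = load_image(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        "/in_paint/overture-creations-5sI6fQgYIuo.png"
    ).resize((128, 128))
    images = pipe(
        prompt="sailing ship in storm by Leonardo da Vinci",
        image=init_image,
        strength=0.75,
        num_inference_steps=3,
        output_type="np",
    ).images
    # With output_type="np", the result is an array of shape (batch, height, width, 3).
    assert isinstance(images, np.ndarray)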
+class ORTPipelineForInpaintingTest(ORTModelTestMixin):
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion"]
+
+    AUTOMODEL_CLASS = AutoPipelineForInpainting
+    ORTMODEL_CLASS = ORTPipelineForInpainting
+
+    TASK = "inpainting"
+
+    def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"):
+        assert batch_size == 1, "Inpainting models only support batch_size=1"
+        assert input_type == "pil", "Inpainting models only support input_type='pil'"
+
+        inputs = _generate_prompts(batch_size=batch_size)
+
+        inputs["image"] = _generate_images(
+            height=height, width=width, batch_size=1, channel=channel, input_type="pil"
+        )[0]
+        inputs["mask_image"] = _generate_images(
+            height=height, width=width, batch_size=1, channel=channel, input_type="pil"
+        )[0]
+
+        inputs["height"] = height
+        inputs["width"] = width
+
+        return inputs
+
+    @require_diffusers
+    def test_load_vanilla_model_which_is_not_supported(self):
+        with self.assertRaises(Exception) as context:
+            _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
+
+        self.assertIn(
+            f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception)
+        )
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_ort_pipeline_class_dispatch(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch])
+
+        # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        self.assertEqual(pipeline.vae_scale_factor, 2)
+        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
+        self.assertEqual(pipeline.unet.config["in_channels"], 4)
+
+        batch_size, height = 1, 32
+        for width in [64, 32]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+            for num_images in [1, 3]:
+                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
+                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @pytest.mark.cuda_ep_test
+    @require_diffusers
+    def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["num_inference_steps"] = 3
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        ort_callback = Callback()
+        auto_callback = Callback()
+        # callback_steps=1 to trigger callback every step
+        ort_pipe(**inputs, callback=ort_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ort_callback.has_been_called)
+        self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        height, width, batch_size = 32, 64, 1
+
+        for input_type in ["pil"]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
+
+            for output_type in ["np", "pil", "latent"]:
+                inputs["output_type"] = output_type
+                outputs = pipeline(**inputs).images
+                if output_type == "pil":
+                    self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+                elif output_type == "np":
+                    self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+                else:
+                    self.assertEqual(
+                        outputs.shape,
+                        (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                    )
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        if model_arch in ["stable-diffusion"]:
+            pytest.skip(
+                "Stable Diffusion For Inpainting fails: it used to be compared to StableDiffusionPipeline, which is the text-to-image variant, for some reason"
+            )
+
+        model_args = {"test_name": model_arch, "model_arch": model_arch}
+        self._setup(model_args)
+
+        ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch])
+        diffusers_pipeline = 
self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + latents_shape = ( + batch_size, + ort_pipeline.vae_decoder.config["latent_channels"], + height // ort_pipeline.vae_scale_factor, + width // ort_pipeline.vae_scale_factor, + ) + + np_latents = np.random.rand(*latents_shape).astype(np.float32) + torch_latents = torch.from_numpy(np_latents) + + ort_output = ort_pipeline(**inputs, latents=np_latents).images + diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images + + self.assertTrue( + np.allclose(ort_output, diffusers_output, atol=1e-4), + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), + ) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + for generator_framework in ["np", "pt"]: + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + +class ImageProcessorTest(unittest.TestCase): + def test_vae_image_processor_pt(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_pt = torch.stack(_generate_images(height=8, width=8, batch_size=1, input_type="pt")) + input_np = to_np(input_pt) + + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) + out_np = to_np(out) + in_np = (input_np * 255).round() if output_type == "pil" else input_np + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + + def test_vae_image_processor_np(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_np = np.stack(_generate_images(height=8, width=8, input_type="np")) + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) + out_np = to_np(out) + in_np = (input_np * 255).round() if output_type == "pil" else input_np + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + + def test_vae_image_processor_pil(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_pil = _generate_images(height=8, width=8, batch_size=1, input_type="pil") + + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) + for i, o in zip(input_pil, out): + in_np = np.array(i) + out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 4b44acb38ab..199b96342e7 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -89,15 +89,8 @@ ORTModelForSpeechSeq2Seq, 
ORTModelForTokenClassification, ORTModelForVision2Seq, - ORTStableDiffusionPipeline, ) from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder -from optimum.onnxruntime.modeling_diffusion import ( - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, -) from optimum.onnxruntime.modeling_ort import ORTModel from optimum.pipelines import pipeline from optimum.utils import ( @@ -108,7 +101,24 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) -from optimum.utils.testing_utils import grid_parameters, remove_directory, require_hf_token, require_ort_rocm +from optimum.utils.import_utils import is_diffusers_available +from optimum.utils.testing_utils import ( + grid_parameters, + remove_directory, + require_diffusers, + require_hf_token, + require_ort_rocm, +) + + +if is_diffusers_available(): + from optimum.onnxruntime.modeling_diffusion import ( + ORTModelTextEncoder, + ORTModelUnet, + ORTModelVaeDecoder, + ORTModelVaeEncoder, + ORTStableDiffusionPipeline, + ) logger = logging.get_logger() @@ -205,6 +215,7 @@ def test_load_seq2seq_model_from_empty_cache(self): with self.assertRaises(Exception): _ = ORTModelForSeq2SeqLM.from_pretrained(self.TINY_ONNX_SEQ2SEQ_MODEL_ID, local_files_only=True) + @require_diffusers def test_load_stable_diffusion_model_from_cache(self): _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching @@ -218,6 +229,7 @@ def test_load_stable_diffusion_model_from_cache(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + @require_diffusers def test_load_stable_diffusion_model_from_empty_cache(self): dirpath = os.path.join( default_cache_path, "models--" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID.replace("/", "--") @@ -300,6 +312,7 @@ def test_load_seq2seq_model_unknown_provider(self): with self.assertRaises(ValueError): ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="FooExecutionProvider") + @require_diffusers def test_load_stable_diffusion_model_from_hub(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) @@ -308,6 +321,7 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_load_stable_diffusion_model_cuda_provider(self): @@ -321,6 +335,7 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -335,6 +350,7 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CPUExecutionProvider" @@ -346,6 +362,7 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) + @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): with self.assertRaises(ValueError): 
ORTStableDiffusionPipeline.from_pretrained( @@ -478,6 +495,7 @@ def test_passing_session_options_seq2seq(self): self.assertEqual(model.encoder.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.decoder.session.get_session_options().intra_op_num_threads, 3) + @require_diffusers def test_passing_session_options_stable_diffusion(self): options = onnxruntime.SessionOptions() options.intra_op_num_threads = 3 @@ -772,6 +790,7 @@ def test_seq2seq_model_on_rocm_ep_str(self): self.assertEqual(model.decoder_with_past.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_passing_provider_options_stable_diffusion(self): @@ -810,6 +829,7 @@ def test_passing_provider_options_stable_diffusion(self): model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "0" ) + @require_diffusers def test_stable_diffusion_model_on_cpu(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") @@ -825,7 +845,7 @@ def test_stable_diffusion_model_on_cpu(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) - # test string device input for to() + @require_diffusers def test_stable_diffusion_model_on_cpu_str(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") @@ -841,6 +861,7 @@ def test_stable_diffusion_model_on_cpu_str(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu(self): @@ -858,6 +879,7 @@ def test_stable_diffusion_model_on_gpu(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -876,6 +898,7 @@ def test_stable_diffusion_model_on_rocm_ep(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") def test_stable_diffusion_model_on_gpu_id(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) @@ -899,7 +922,7 @@ def test_stable_diffusion_model_on_gpu_id(self): self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") - # test string device input for to() + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu_str(self): @@ -916,6 +939,7 @@ def test_stable_diffusion_model_on_gpu_str(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -975,6 
+999,7 @@ def test_save_seq2seq_model_without_past(self): self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) self.assertTrue(CONFIG_NAME in folder_contents) + @require_diffusers def test_save_stable_diffusion_model(self): with tempfile.TemporaryDirectory() as tmpdirname: model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) @@ -1050,6 +1075,7 @@ def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") remove_directory(tmpdirname) + @require_diffusers def test_save_load_stable_diffusion_model_with_external_data(self): with tempfile.TemporaryDirectory() as tmpdirname: os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1" # force exporting small model with external data @@ -1180,6 +1206,7 @@ def test_push_seq2seq_model_with_external_data_to_hub(self): ) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") + @require_diffusers @require_hf_token def test_push_stable_diffusion_model_with_external_data_to_hub(self): with tempfile.TemporaryDirectory() as tmpdirname: diff --git a/tests/onnxruntime/test_stable_diffusion_pipeline.py b/tests/onnxruntime/test_stable_diffusion_pipeline.py deleted file mode 100644 index 44cd22ffecc..00000000000 --- a/tests/onnxruntime/test_stable_diffusion_pipeline.py +++ /dev/null @@ -1,562 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import random -import unittest -from typing import Dict - -import numpy as np -import PIL -import pytest -import torch -from diffusers import ( - OnnxStableDiffusionImg2ImgPipeline, - StableDiffusionPipeline, - StableDiffusionXLPipeline, -) -from diffusers.utils import load_image -from diffusers.utils.testing_utils import floats_tensor -from packaging.version import Version, parse -from parameterized import parameterized -from transformers.testing_utils import require_torch_gpu -from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin - -from optimum.onnxruntime import ( - ORTLatentConsistencyModelPipeline, - ORTStableDiffusionImg2ImgPipeline, - ORTStableDiffusionInpaintPipeline, - ORTStableDiffusionPipeline, - ORTStableDiffusionXLImg2ImgPipeline, - ORTStableDiffusionXLPipeline, -) -from optimum.onnxruntime.modeling_diffusion import ( - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, -) -from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor -from optimum.utils.import_utils import _diffusers_version -from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm - - -if parse(_diffusers_version) > Version("0.21.4"): - from diffusers import LatentConsistencyModelPipeline - - -def _generate_inputs(batch_size=1): - inputs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - -def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"): - if input_type == "pil": - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) - elif input_type == "np": - image = np.random.rand(height, width, channel) - elif input_type == "pt": - image = torch.rand((channel, height, width)) - - return [image] * batch_size - - -def to_np(image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - -class ORTStableDiffusionPipelineBase(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionPipeline - TASK = "text-to-image" - - @require_diffusers - def test_load_vanilla_model_which_is_not_supported(self): - with self.assertRaises(Exception) as context: - _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) - - self.assertIn( - f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_num_images_per_prompt(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - 
@parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_callback(self, model_arch: str): - def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: - callback_fn.has_been_called = True - callback_fn.number_of_steps += 1 - - pipe = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - callback_fn.has_been_called = False - callback_fn.number_of_steps = 0 - inputs = self.generate_inputs(height=64, width=64) - pipe(**inputs, callback=callback_fn, callback_steps=1) - self.assertTrue(callback_fn.has_been_called) - self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"]) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_shape(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - height, width, batch_size = 128, 64, 1 - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - - if self.TASK == "image-to-image": - input_types = ["np", "pil", "pt"] - elif self.TASK == "text-to-image": - input_types = ["np"] - else: - input_types = ["pil"] - - for input_type in input_types: - if self.TASK == "image-to-image": - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - else: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for output_type in ["np", "pil", "latent"]: - inputs["output_type"] = output_type - outputs = pipeline(**inputs).images - if output_type == "pil": - self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) - elif output_type == "np": - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - else: - self.assertEqual( - outputs.shape, - (batch_size, 4, 
height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), - ) - - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = _generate_inputs(batch_size=batch_size) - inputs["height"] = height - inputs["width"] = width - return inputs - - -class ORTStableDiffusionImg2ImgPipelineTest(ORTStableDiffusionPipelineBase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionImg2ImgPipeline - TASK = "image-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_diffusers_pipeline(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - height, width = 128, 128 - - inputs = self.generate_inputs(height=height, width=width) - inputs["prompt"] = "A painting of a squirrel eating a burger" - inputs["image"] = floats_tensor((1, 3, height, width), rng=random.Random(SEED)) - - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - - diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - - self.assertTrue(np.allclose(ort_output, diffusers_onnx_output, atol=1e-1)) - - def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size=batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) - inputs["strength"] = 0.75 - return inputs - - -class ORTStableDiffusionPipelineTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - pipeline.safety_checker = None - batch_size, num_images_per_prompt, height, width = 1, 2, 64, 32 - - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": "sailing ship in storm by Leonardo da Vinci", - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - self.assertIsInstance(ort_outputs, np.ndarray) - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) - - 
@parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_image_reproducibility(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - height, width = 64, 32 - np.random.seed(0) - ort_outputs_1 = pipeline(**inputs, height=height, width=width) - np.random.seed(0) - ort_outputs_2 = pipeline(**inputs, height=height, width=width) - ort_outputs_3 = pipeline(**inputs, height=height, width=width) - # Compare model outputs - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) - self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_negative_prompt(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - inputs["height"], inputs["width"] = 64, 32 - negative_prompt = ["This is a negative prompt"] - np.random.seed(0) - image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1] - prompt = inputs.pop("prompt") - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = pipeline.tokenizer( - p, - padding="max_length", - max_length=pipeline.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32)) - embeds.append(pipeline.text_encoder(text_inputs)[0]) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - np.random.seed(0) - image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1] - self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4)) - - -class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion-xl", - ] - ORTMODEL_CLASS = ORTStableDiffusionXLPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ort_outputs, np.ndarray) - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # 
Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_image_reproducibility(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - height, width = 64, 32 - np.random.seed(0) - ort_outputs_1 = pipeline(**inputs, height=height, width=width) - np.random.seed(0) - ort_outputs_2 = pipeline(**inputs, height=height, width=width) - ort_outputs_3 = pipeline(**inputs, height=height, width=width) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) - self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - -class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionInpaintPipeline - TASK = "inpainting" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_diffusers_pipeline(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch]) - height, width = 64, 64 - latents_shape = ( - 1, - ort_pipeline.vae_decoder.config["latent_channels"], - height // ort_pipeline.vae_scale_factor, - width // ort_pipeline.vae_scale_factor, - ) - inputs = self.generate_inputs(height=height, width=width) - - np_latents = np.random.rand(*latents_shape).astype(np.float32) - torch_latents = torch.from_numpy(np_latents) - - ort_outputs = ort_pipeline(**inputs, latents=np_latents).images - self.assertEqual(ort_outputs.shape, (1, height, width, 3)) - - diffusers_outputs = diffusers_pipeline(**inputs, latents=torch_latents).images - self.assertEqual(diffusers_outputs.shape, (1, height, width, 3)) - - self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4)) - - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width) - inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - return inputs - - -class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion-xl", - ] - ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline - TASK = "image-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_inference(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - - height, width = 128, 128 - inputs = self.generate_inputs(height=height, width=width) - inputs["image"] = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) - output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1] - expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080]) - - self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1)) - - def 
generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size=batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) - inputs["strength"] = 0.75 - return inputs - - -class ImageProcessorTest(unittest.TestCase): - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pt = torch.stack(_create_image(height=8, width=8, batch_size=1, input_type="pt")) - input_np = to_np(input_pt) - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_np = np.stack(_create_image(height=8, width=8, input_type="np")) - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pil = _create_image(height=8, width=8, batch_size=1, input_type="pil") - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - -class ORTLatentConsistencyModelPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "latent-consistency", - ] - ORTMODEL_CLASS = ORTLatentConsistencyModelPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - @unittest.skipIf( - parse(_diffusers_version) <= Version("0.21.4"), - "not supported with this diffusers version, needs diffusers>=v0.22.0", - ) - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_scale": 8.5, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ort_outputs, np.ndarray) - with torch.no_grad(): - outputs = 
pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) From 2179d33a5a9539f065f3c80ca548a05859481688 Mon Sep 17 00:00:00 2001 From: yuanwu2017 Date: Wed, 18 Sep 2024 16:30:52 +0800 Subject: [PATCH 03/50] Disable the exllama on all non-cuda devices. (#2003) * Disable the exllama on all non-cuda devices. 1. Disable the exllama on all non-cuda devices. 2. Don't raise the error when running on non-cuda device. Signed-off-by: yuanwu * Refine the code Signed-off-by: yuanwu * Fix errors of make style Signed-off-by: yuanwu * Add hpu device Signed-off-by: yuanwu * Update optimum/gptq/constants.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Fix error of make style Signed-off-by: yuanwu --------- Signed-off-by: yuanwu Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/gptq/quantizer.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 902af87bbb0..949d4d260df 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -546,7 +546,7 @@ def tmp(_, input, output): if self.bits == 4: # device not on gpu - if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): + if device.type != "cuda" or (has_device_map and any(d in devices for d in ["cpu", "disk", "hpu"])): if not self.disable_exllama: logger.warning( "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`" @@ -589,13 +589,14 @@ def post_init_model(self, model): The input model """ if self.bits == 4 and not self.disable_exllama: - if get_device(model) == torch.device("cpu") or ( - hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) + if get_device(model).type != "cuda" or ( + hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk", "hpu"]) ): - raise ValueError( - "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" - ) + if not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. 
Setting `disable_exllama=True`" + ) + self.disable_exllama = True class StoreAttr(object): pass From bf1befdf7076c12a904eddfef167bfeb3e4fa0f2 Mon Sep 17 00:00:00 2001 From: Longjie Zheng <32992656+zhenglongjiepheonix@users.noreply.github.com> Date: Wed, 18 Sep 2024 08:10:23 -0400 Subject: [PATCH 04/50] Add Parallel Cross Entropy (#2017) --- optimum/fx/parallelization/decomp.py | 2 +- .../op_registry/op_handlers.py | 35 +++- .../parallel_layers/__init__.py | 1 + .../parallelization/parallel_layers/loss.py | 163 ++++++++++++++++++ optimum/fx/parallelization/passes.py | 45 ++++- optimum/fx/parallelization/utils.py | 34 ++++ .../parallelization/test_tensor_parallel.py | 20 +-- 7 files changed, 280 insertions(+), 20 deletions(-) create mode 100644 optimum/fx/parallelization/parallel_layers/loss.py diff --git a/optimum/fx/parallelization/decomp.py b/optimum/fx/parallelization/decomp.py index 26258d451bf..5410818e929 100644 --- a/optimum/fx/parallelization/decomp.py +++ b/optimum/fx/parallelization/decomp.py @@ -197,7 +197,7 @@ def run(self, *args, **kwargs): def decompose_and_functionalize( graph_module: GraphModule, decomposition_table: Dict[torch._ops.OperatorBase, Callable] = core_aten_decompositions(), - leaf_function_targets: List[Callable] = [F.scaled_dot_product_attention], + leaf_function_targets: List[Callable] = [F.scaled_dot_product_attention, F.cross_entropy], ) -> Callable: """ API to decompose and functionalize a high-level graph module. diff --git a/optimum/fx/parallelization/op_registry/op_handlers.py b/optimum/fx/parallelization/op_registry/op_handlers.py index 56b8fc16bc0..4a9c55e3764 100644 --- a/optimum/fx/parallelization/op_registry/op_handlers.py +++ b/optimum/fx/parallelization/op_registry/op_handlers.py @@ -19,7 +19,7 @@ from torch.fx import Node from ..core import Config -from ..utils import is_activation, is_embedding, is_linear +from ..utils import is_activation, is_cross_entropy, is_cross_entropy_parallel_compatible, is_embedding, is_linear class Registry: @@ -334,7 +334,16 @@ def propagate(self) -> List[int]: ndim = arg.meta["val"].ndim slice_dim = (slice_dim + ndim) % ndim if slice_dim == axis: - # slice on the parallel axis is not allowed + # slice on the parallel axis is not allowed, except it's a nop + start, stop, step = 0, arg.meta["val"].shape[axis], 1 + if len(self.node.args) > 2: + start = self.node.args[2] + elif len(self.node.args) > 3: + stop = self.node.args[3] + elif len(self.node.args) > 4: + step = self.node.args[4] + if start == 0 and stop >= arg.meta["val"].shape[axis] and step == 1: + return [axis] return [] return [axis] @@ -404,12 +413,12 @@ def propagate(self) -> List[int]: if self.node.op in ["placeholder", "get_attr"]: return [None] elif self.node.op == "output": - for node in self.node.all_input_nodes: - # TODO: allow parallelized nodes in output, and append comm ops in graph tp all-gather - # parallelized output if intructed - if self.extract_axis(node) is not None: - return [] - return [None] + # does not care about if output is being parallelized right now, because if the output is loss, + # then it must be not parallelized as long as it comes from sharded cross entropy. + # TODO: append all-gather comm ops before all parallelized output nodes if instructed. 
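For intuition about why a loss coming out of sharded cross entropy needs no parallel axis: splitting the logits along the vocab dimension and recombining per-shard maxima and exp-sums reproduces the unsharded loss exactly, so every rank already holds the same replicated value. A minimal single-process sketch of that identity in plain PyTorch (shapes are illustrative; this is not the optimum API):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits, target = torch.randn(4, 10), torch.randint(0, 10, (4,))
shards = logits.split(5, dim=-1)  # two simulated ranks, each holding half the vocab

# all-reduce MAX over ranks, then all-reduce SUM of the shifted exp-sums
global_max = torch.stack([s.max(dim=-1).values for s in shards]).max(dim=0).values
sum_exp = sum((s - global_max[:, None]).exp().sum(dim=-1) for s in shards)
# only the rank owning the target index contributes the predicted logit
predicted = logits.gather(-1, target[:, None]).squeeze(-1) - global_max

loss = (sum_exp.log() - predicted).mean()
assert torch.allclose(loss, F.cross_entropy(logits, target), atol=1e-6)
```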
+ input_arg = self.node.all_input_nodes[0] + axis = self.extract_axis(input_arg) + return [axis] elif is_linear(self.node): input_arg = self.node.all_input_nodes[0] axis = self.extract_axis(input_arg) @@ -438,6 +447,16 @@ def propagate(self) -> List[int]: return [1, None] if self.config.enable_sequence_parallel else [None] else: return [] + elif is_cross_entropy(self.node): + logits = self.node.all_input_nodes[0] + axis = self.extract_axis(logits) + if axis is None or ( + is_cross_entropy_parallel_compatible(self.node) and axis == logits.meta["val"].ndim - 1 + ): + # for cross entropy, the input logits parallel axis can only be the last axis or None + return [None] + else: + return [] elif is_activation(self.node): return UnaryOpParallelAxisPropagateHandler(self.node, self.meta_key, self.config).propagate() diff --git a/optimum/fx/parallelization/parallel_layers/__init__.py b/optimum/fx/parallelization/parallel_layers/__init__.py index 9bfb13afdf6..474ae7f7eef 100644 --- a/optimum/fx/parallelization/parallel_layers/__init__.py +++ b/optimum/fx/parallelization/parallel_layers/__init__.py @@ -14,3 +14,4 @@ # limitations under the License. from .embedding import VocabParallelEmbedding from .linear import ColumnParallelLinear, RowParallelLinear +from .loss import VocabParallelCrossEntropyLoss, sharded_cross_entropy_wrapper_fn diff --git a/optimum/fx/parallelization/parallel_layers/loss.py b/optimum/fx/parallelization/parallel_layers/loss.py new file mode 100644 index 00000000000..0a11e33c08e --- /dev/null +++ b/optimum/fx/parallelization/parallel_layers/loss.py @@ -0,0 +1,163 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import wraps +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn + +from ..core import ParallelExecutionCtx + + +# Adapted from https://github.com/huggingface/nanotron/blob/main/src/nanotron/parallel/tensor_parallel/functional.py +class _ShardedCrossEntropy(torch.autograd.Function): + @staticmethod + def forward( + ctx, + sharded_logits: torch.Tensor, # (batch_size, length, sharded_hidden_size) + target: torch.Tensor, # (batch_size, length) + group: dist.ProcessGroup, + ): + # Maximum value along last dimension across all GPUs. + logits_max = torch.max(sharded_logits, dim=-1)[0] + dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=group) + # Subtract the maximum value. + sharded_logits = sharded_logits - logits_max.unsqueeze(dim=-1) + + # Get the shard's indices + sharded_hidden_size = sharded_logits.shape[-1] + rank = dist.get_rank(group) + start_index = rank * sharded_hidden_size + end_index = start_index + sharded_hidden_size + + # Create a mask of valid ids (1 means it needs to be masked). + target_mask = (target < start_index) | (target >= end_index) + masked_target = target.clone() - start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. 
+ # For Simplicity, we convert logits to a 2-D tensor with size + # [*, shard-size] and target to a 1-D tensor of size [*]. + logits_2d = sharded_logits.view(-1, sharded_hidden_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange(start=0, end=logits_2d.shape[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + if predicted_logits_1d.is_contiguous(): + predicted_logits_1d = predicted_logits_1d.clone() + else: + predicted_logits_1d = predicted_logits_1d.contiguous() + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + # All reduce is needed to get the chunks from other GPUs. + dist.all_reduce(predicted_logits, op=dist.ReduceOp.SUM, group=group) + + # Sum of exponential of logits along vocab dimension across all GPUs. + exp_logits = sharded_logits + torch.exp(sharded_logits, out=exp_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + dist.all_reduce(sum_exp_logits, op=dist.ReduceOp.SUM, group=group) + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Normalize and optionally smooth logits + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + + # Store softmax, target-mask and masked-target for backward pass. + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + # Retrieve tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + # All the inputs have softmax as their gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + sharded_hidden_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, sharded_hidden_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float() + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input, None, None + + +def sharded_cross_entropy(sharded_logits: torch.Tensor, target: torch.Tensor, process_group: dist.ProcessGroup): + return _ShardedCrossEntropy.apply(sharded_logits, target, process_group) + + +def sharded_cross_entropy_wrapper_fn(process_group: dist.ProcessGroup): + @wraps(sharded_cross_entropy) + def wrapper( + sharded_logits: torch.Tensor, + target: torch.Tensor, + weight: Optional[torch.Tensor] = None, + size_average: Optional[bool] = None, + ignore_index: int = -100, + reduce: Optional[bool] = None, + reduction: str = "mean", + label_smoothing: float = 0.0, + ): + if weight is not None or ignore_index != -100 or label_smoothing != 0.0: + raise ValueError( + "Does not support weighted mode, index ignoring and label smoothing in current parallel cross entropy implementation." 
+ ) + loss: torch.Tensor = sharded_cross_entropy(sharded_logits, target, process_group) + + if size_average is not None or reduce is not None: + size_average = True if size_average is None else size_average + reduce = True if reduce is None else reduce + + if size_average and reduce: + reduction = "mean" + elif reduce: + reduction = "sum" + else: + reduction = "none" + + if reduction == "mean": + return loss.mean() + elif reduction == "sum": + return loss.sum() + return loss + + return wrapper + + +class VocabParallelCrossEntropyLoss(nn.Module): + """ + Simple parallel cross entropy implementation which does not support weighted mode and label smoothing yet. + """ + + def __init__(self, ctx: ParallelExecutionCtx, reduction: str = "mean") -> None: + super(VocabParallelCrossEntropyLoss, self).__init__() + self.process_group = ctx.tp_group + self.reduction = reduction + + def forward(self, sharded_logits: torch.Tensor, target: torch.Tensor): + loss: torch.Tensor = _ShardedCrossEntropy.apply(sharded_logits, target, self.process_group) + if self.reduction == "mean": + return loss.mean() + elif self.reduction == "sum": + return loss.sum() + return loss diff --git a/optimum/fx/parallelization/passes.py b/optimum/fx/parallelization/passes.py index 14b652fff73..90155263281 100644 --- a/optimum/fx/parallelization/passes.py +++ b/optimum/fx/parallelization/passes.py @@ -26,8 +26,15 @@ from .decomp import decompose_and_functionalize from .distributed import scatter from .op_registry import REGISTRY, FallbackParallelAxisPropagateHandler -from .parallel_layers import ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding +from .parallel_layers import ( + ColumnParallelLinear, + RowParallelLinear, + VocabParallelCrossEntropyLoss, + VocabParallelEmbedding, + sharded_cross_entropy_wrapper_fn, +) from .utils import ( + is_cross_entropy, is_embedding, is_linear, is_shape_consumer, @@ -273,6 +280,11 @@ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Conf info["sequence_parallel"] = False self.place_marker_per_node(node, info) + elif is_cross_entropy(node): + axis_before = ParallelAxisSolverPass.get_stored_field_info(node.args[0], "parallel_axis") + if axis_before is not None: + self.place_marker_per_node(node, {"axis": "vocab"}) + return graph_module @@ -343,6 +355,35 @@ def handle_embedding(node: Node, ctx: ParallelExecutionCtx) -> None: layer_cache[key] = new_mod setattr(parent_mod, field, new_mod) + @staticmethod + def handle_cross_entropy(node: Node, ctx: ParallelExecutionCtx) -> None: + axis = ParallelLayerAnnotatePass.get_stored_field_info(node, field="axis") + if axis is None: + return + + assert axis in {"vocab"}, "Only support parallelization on vocab dim for now." 
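A quick smoke test for the sharded loss wired in by this pass: with a single rank the lone shard holds the entire vocab, so the result must match `F.cross_entropy` directly. A hedged sketch (gloo backend; the env values are illustrative and the import path is assumed from the diffstat above):

```python
import os

import torch
import torch.distributed as dist
import torch.nn.functional as F

from optimum.fx.parallelization.parallel_layers.loss import sharded_cross_entropy

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

logits = torch.randn(2, 8, 32)  # (batch, length, vocab)
target = torch.randint(0, 32, (2, 8))

# per-token losses, reduced manually like the wrapper's reduction="mean" path
loss = sharded_cross_entropy(logits, target, dist.group.WORLD).mean()
reference = F.cross_entropy(logits.view(-1, 32), target.view(-1))
assert torch.allclose(loss, reference, atol=1e-6)

dist.destroy_process_group()
```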
+ if node.op == "call_module": + graph_module = node.graph.owning_module + prefix_and_field = node.target.rsplit(".", maxsplit=1) + if len(prefix_and_field) == 2: + parent_mod = graph_module.get_submodule(prefix_and_field[0]) + field = prefix_and_field[1] + else: + parent_mod = graph_module + field = node.target + + mod: nn.CrossEntropyLoss = graph_module.get_submodule(node.target) + key, layer_cache = node.target, ctx.parallel_layer_cache + if key in layer_cache: + new_mod = layer_cache[key] + else: + assert ctx.compile_times == 0, "illegal path for recompilation" + new_mod = VocabParallelCrossEntropyLoss(ctx, reduction=mod.reduction) + layer_cache[key] = new_mod + setattr(parent_mod, field, new_mod) + else: + node.target = sharded_cross_entropy_wrapper_fn(process_group=ctx.tp_group) + @staticmethod def handle_hard_coded_axis_param(node: Node, ctx: ParallelExecutionCtx) -> None: def extract_shape_from_node(node: Node) -> List[Any]: @@ -384,6 +425,8 @@ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Conf self.handle_linear(node, ctx) elif is_embedding(node): self.handle_embedding(node, ctx) + elif is_cross_entropy(node): + self.handle_cross_entropy(node, ctx) # correct the attention head num in parallel setting elif is_shape_consumer(node): self.handle_hard_coded_axis_param(node, ctx) diff --git a/optimum/fx/parallelization/utils.py b/optimum/fx/parallelization/utils.py index b7b1ccd41c8..3074638737f 100644 --- a/optimum/fx/parallelization/utils.py +++ b/optimum/fx/parallelization/utils.py @@ -82,6 +82,40 @@ def is_shape_generator(node: Node) -> bool: return node.op == "call_method" and node.target == "size" +def is_cross_entropy(node: Node) -> bool: + if node.op == "call_function": + return node.target is F.cross_entropy + elif node.op == "call_module": + mod = node.graph.owning_module + return isinstance(mod.get_submodule(node.target), nn.CrossEntropyLoss) + return False + + +def is_cross_entropy_parallel_compatible(node: Node) -> bool: + """ + For now `VocabParallelCrossEntropyLoss` does not support weighted mode, index ignoring and label smoothing. 
+ """ + if node.op == "call_function": + weight = node.kwargs.get("weight", None) + ignore_index = node.kwargs.get("ignore_index", -100) + label_smoothing = node.kwargs.get("label_smoothing", 0.0) + if len(node.args) > 2 and weight is None: + weight = node.args[2] + if len(node.args) > 4 and ignore_index == -100: + ignore_index = node.args[4] + if len(node.args) > 7 and label_smoothing == 0.0: + label_smoothing = node.args[7] + + return weight is None and ignore_index == -100 and label_smoothing == 0.0 + + elif node.op == "call_module": + mod: nn.CrossEntropyLoss = node.graph.owning_module.get_submodule(node.target) + weight, label_smoothing, ignore_index = mod.weight, mod.label_smoothing, mod.ignore_index + return weight is None and ignore_index == -100 and label_smoothing == 0.0 + + return False + + def stable_topological_sort(graph: Graph): def _args(n: torch.fx.Node) -> List[torch.fx.node.Argument]: args: List[torch.fx.node.Argument] = [] diff --git a/tests/fx/parallelization/test_tensor_parallel.py b/tests/fx/parallelization/test_tensor_parallel.py index 9626fccec3b..8a00393c4d7 100644 --- a/tests/fx/parallelization/test_tensor_parallel.py +++ b/tests/fx/parallelization/test_tensor_parallel.py @@ -36,6 +36,7 @@ "output_attentions": False, "output_hidden_states": False, "tie_word_embeddings": True, + "return_dict": True, } DUMMY_MODELS_TO_TEST = ( @@ -64,11 +65,10 @@ def prepare_dummy_inputs( seq_len: int = 10, device: Union[str, torch.device] = "cuda", ): - return { - "input_ids": torch.randint(low=1, high=model_config.vocab_size, size=(batch_size, seq_len), device=device), - "attention_mask": torch.ones((batch_size, seq_len), dtype=torch.int64, device=device), - "position_ids": torch.arange(0, seq_len, device=device).unsqueeze(0).expand(batch_size, -1), - } + input_ids = torch.randint(low=1, high=model_config.vocab_size, size=(batch_size, seq_len), device=device) + attention_mask = torch.ones((batch_size, seq_len), dtype=torch.int64, device=device) + labels = input_ids.clone() + return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} def run_test_all_rank_results_match(rank: int, world_size: int, model_id: str, model_kwargs: Dict[str, Any]): @@ -82,8 +82,8 @@ def run_test_all_rank_results_match(rank: int, world_size: int, model_id: str, m model = parallelize_model(model_id, ctx, skip_load_weights=True, **model_kwargs) inputs = prepare_dummy_inputs(model.config) - logits = model(**inputs)[0] - tensors = gather_at_main_process(tensor=logits, group=tp_group, rank=rank, world_size=world_size) + loss = model(**inputs).loss + tensors = gather_at_main_process(tensor=loss, group=tp_group, rank=rank, world_size=world_size) # check results at main worker process if rank == 0: @@ -145,7 +145,7 @@ def run_test_parallel_results_matches_non_parallel( inputs = prepare_dummy_inputs(model.config) set_seed(SEED) - logits = model(**inputs)[0] + loss = model(**inputs).loss torch._dynamo.reset() del model @@ -154,9 +154,9 @@ def run_test_parallel_results_matches_non_parallel( set_seed(SEED) ctx = ParallelExecutionCtx(tp_group=tp_group, current_device=device) model = parallelize_model(model_id, ctx, skip_load_weights=True, **model_kwargs) - parallel_logits = model(**inputs)[0] + parallel_loss = model(**inputs).loss - torch.testing.assert_close(logits.cpu(), parallel_logits.cpu(), rtol=1e-4, atol=1e-4) + torch.testing.assert_close(loss.cpu(), parallel_loss.cpu(), rtol=1e-4, atol=1e-4) dist.barrier(tp_group) tearDown() From 2fb5ea5ca7ca8ea887af2851cce80ab2545d3f4f Mon Sep 17 
00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 18 Sep 2024 22:48:34 +0200 Subject: [PATCH 05/50] Fix `is_torch_tpu_available` in ORT Trainer (#2028) --- optimum/onnxruntime/trainer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 86c333adb3f..66273cbcf96 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -103,14 +103,14 @@ from transformers.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_zero3_enabled if check_if_transformers_greater("4.39"): - from transformers.utils import is_torch_xla_available + from transformers.utils import is_torch_xla_available as is_torch_tpu_xla_available - if is_torch_xla_available(): + if is_torch_tpu_xla_available(): import torch_xla.core.xla_model as xm else: - from transformers.utils import is_torch_tpu_available + from transformers.utils import is_torch_tpu_available as is_torch_tpu_xla_available - if is_torch_tpu_available(check_device=False): + if is_torch_tpu_xla_available(check_device=False): import torch_xla.core.xla_model as xm if TYPE_CHECKING: @@ -735,7 +735,7 @@ def get_dataloader_sampler(dataloader): if ( args.logging_nan_inf_filter - and not is_torch_tpu_available() + and not is_torch_tpu_xla_available() and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): # if loss is nan or inf simply add the average of previous logged losses From fd638d20046a73a7221083b23c69b98445e2d321 Mon Sep 17 00:00:00 2001 From: Vijay Date: Thu, 26 Sep 2024 12:59:19 +0530 Subject: [PATCH 06/50] Added image-to-image task for ORT Pipeline (#2031) * Add ORTModelForImageToImage for image-to-image task Swin2SR * Added image-to-image task to optimum pipeline * Add Tests for ORTModelForImageToImage for image-to-image task Swin2SR * Use export=True for models from transformers, self._setup and more * Code Refactor * Refactor ORTModelForImageToImageIntegrationTest --- optimum/onnxruntime/__init__.py | 2 + optimum/onnxruntime/modeling_ort.py | 73 ++++++++++ optimum/pipelines/pipelines_base.py | 8 ++ tests/onnxruntime/test_modeling.py | 136 ++++++++++++++++++- tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 5 files changed, 219 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 09a48ec955c..1cb5b7c47b9 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -44,6 +44,7 @@ "ORTModelForSemanticSegmentation", "ORTModelForSequenceClassification", "ORTModelForTokenClassification", + "ORTModelForImageToImage", ], "modeling_seq2seq": [ "ORTModelForSeq2SeqLM", @@ -112,6 +113,7 @@ ORTModelForCustomTasks, ORTModelForFeatureExtraction, ORTModelForImageClassification, + ORTModelForImageToImage, ORTModelForMaskedLM, ORTModelForMultipleChoice, ORTModelForQuestionAnswering, diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 254b771e334..9166f7c2cbe 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -34,6 +34,7 @@ AutoModelForAudioXVector, AutoModelForCTC, AutoModelForImageClassification, + AutoModelForImageToImage, AutoModelForMaskedLM, AutoModelForMultipleChoice, AutoModelForQuestionAnswering, @@ -47,6 +48,7 @@ BaseModelOutput, CausalLMOutput, ImageClassifierOutput, + ImageSuperResolutionOutput, MaskedLMOutput, ModelOutput, MultipleChoiceModelOutput, @@ -2183,6 +2185,77 @@ def forward( return 
TokenClassifierOutput(logits=logits) +IMAGE_TO_IMAGE_EXAMPLE = r""" + Example of image-to-image (Super Resolution): + + ```python + >>> import torch + >>> from transformers import {processor_class} + >>> from optimum.onnxruntime import {model_class} + >>> from PIL import Image + + >>> image = Image.open("path/to/image.jpg") + + >>> image_processor = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... reconstruction = model(**inputs).reconstruction + ``` +""" + + +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTModelForImageToImage(ORTModel): + """ + ONNX Model for image-to-image tasks. This class officially supports swin2sr. + """ + + auto_model_class = AutoModelForImageToImage + + @add_start_docstrings_to_model_forward( + ONNX_IMAGE_INPUTS_DOCSTRING.format("batch_size, num_channels, height, width") + + IMAGE_TO_IMAGE_EXAMPLE.format( + processor_class=_PROCESSOR_FOR_DOC, + model_class="ORTModelForImageToImage", + checkpoint="caidas/swin2SR-realworld-sr-x4-64-bsrgan-psnr", + ) + ) + def forward( + self, + pixel_values: Union[torch.Tensor, np.ndarray], + **kwargs, + ): + use_torch = isinstance(pixel_values, torch.Tensor) + self.raise_on_numpy_input_io_binding(use_torch) + if self.device.type == "cuda" and self.use_io_binding: + input_shapes = pixel_values.shape + io_binding, output_shapes, output_buffers = self.prepare_io_binding( + pixel_values, + ordered_input_names=self._ordered_input_names, + known_output_shapes={ + "reconstruction": [ + input_shapes[0], + input_shapes[1], + input_shapes[2] * self.config.upscale, + input_shapes[3] * self.config.upscale, + ] + }, + ) + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() + reconstruction = output_buffers["reconstruction"].view(output_shapes["reconstruction"]) + else: + model_inputs = {"pixel_values": pixel_values} + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + reconstruction = model_outputs["reconstruction"] + return ImageSuperResolutionOutput(reconstruction=reconstruction) + + CUSTOM_TASKS_EXAMPLE = r""" Example of custom tasks(e.g. 
a sentence transformers taking `pooler_output` as output): diff --git a/optimum/pipelines/pipelines_base.py b/optimum/pipelines/pipelines_base.py index a08ab8782a3..7690143f13f 100644 --- a/optimum/pipelines/pipelines_base.py +++ b/optimum/pipelines/pipelines_base.py @@ -24,6 +24,7 @@ FillMaskPipeline, ImageClassificationPipeline, ImageSegmentationPipeline, + ImageToImagePipeline, ImageToTextPipeline, Pipeline, PreTrainedTokenizer, @@ -55,6 +56,7 @@ ORTModelForCausalLM, ORTModelForFeatureExtraction, ORTModelForImageClassification, + ORTModelForImageToImage, ORTModelForMaskedLM, ORTModelForQuestionAnswering, ORTModelForSemanticSegmentation, @@ -157,6 +159,12 @@ "default": "superb/hubert-base-superb-ks", "type": "audio", }, + "image-to-image": { + "impl": ImageToImagePipeline, + "class": (ORTModelForImageToImage,), + "default": "caidas/swin2SR-classical-sr-x2-64", + "type": "image", + }, } else: ORT_SUPPORTED_TASKS = {} diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 199b96342e7..f6771ce7618 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -42,6 +42,7 @@ AutoModelForCausalLM, AutoModelForCTC, AutoModelForImageClassification, + AutoModelForImageToImage, AutoModelForMaskedLM, AutoModelForMultipleChoice, AutoModelForQuestionAnswering, @@ -57,7 +58,9 @@ PretrainedConfig, set_seed, ) +from transformers.modeling_outputs import ImageSuperResolutionOutput from transformers.modeling_utils import no_init_weights +from transformers.models.swin2sr.configuration_swin2sr import Swin2SRConfig from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import get_gpu_count, require_torch_gpu, slow from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin @@ -79,6 +82,7 @@ ORTModelForCustomTasks, ORTModelForFeatureExtraction, ORTModelForImageClassification, + ORTModelForImageToImage, ORTModelForMaskedLM, ORTModelForMultipleChoice, ORTModelForPix2Struct, @@ -4704,6 +4708,136 @@ def test_compare_generation_to_io_binding( gc.collect() +class ORTModelForImageToImageIntegrationTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["swin2sr"] + + ORTMODEL_CLASS = ORTModelForImageToImage + + TASK = "image-to-image" + + def _get_sample_image(self): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + def _get_preprocessors(self, model_id): + image_processor = AutoImageProcessor.from_pretrained(model_id) + + return image_processor + + def test_load_vanilla_transformers_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = ORTModelForImageToImage.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn("only supports the tasks", str(context.exception)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + self.assertIsInstance(onnx_model.config, Swin2SRConfig) + set_seed(SEED) + + transformers_model = AutoModelForImageToImage.from_pretrained(model_id) + image_processor = self._get_preprocessors(model_id) + + data = self._get_sample_image() + features = image_processor(data, return_tensors="pt") + + with torch.no_grad(): + transformers_outputs = transformers_model(**features) + + onnx_outputs = 
onnx_model(**features) + self.assertIsInstance(onnx_outputs, ImageSuperResolutionOutput) + self.assertTrue("reconstruction" in onnx_outputs) + self.assertIsInstance(onnx_outputs.reconstruction, torch.Tensor) + self.assertTrue(torch.allclose(onnx_outputs.reconstruction, transformers_outputs.reconstruction, atol=1e-4)) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_generate_utils(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + + data = self._get_sample_image() + features = image_processor(data, return_tensors="pt") + + outputs = onnx_model(**features) + self.assertIsInstance(outputs, ImageSuperResolutionOutput) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_image_to_image(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + pipe = pipeline( + "image-to-image", + model=onnx_model, + feature_extractor=image_processor, + ) + data = self._get_sample_image() + outputs = pipe(data) + self.assertEqual(pipe.device, onnx_model.device) + self.assertIsInstance(outputs, Image.Image) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_torch_gpu + @pytest.mark.cuda_ep_test + def test_pipeline_on_gpu(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + pipe = pipeline( + "image-to-image", + model=onnx_model, + feature_extractor=image_processor, + device=0, + ) + + data = self._get_sample_image() + outputs = pipe(data) + + self.assertEqual(pipe.model.device.type.lower(), "cuda") + self.assertIsInstance(outputs, Image.Image) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + pipe = pipeline( + "image-to-image", + model=onnx_model, + feature_extractor=image_processor, + device=0, + ) + + data = self._get_sample_image() + outputs = pipe(data) + + self.assertEqual(pipe.model.device.type.lower(), "cuda") + self.assertIsInstance(outputs, Image.Image) + + class ORTModelForVision2SeqIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = ["vision-encoder-decoder", "trocr", "donut"] @@ -4831,7 +4965,6 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach len(onnx_outputs["past_key_values"][0]), len(transformers_outputs["past_key_values"][0]) ) for i in range(len(onnx_outputs["past_key_values"])): - print(onnx_outputs["past_key_values"][i]) for ort_pkv, trfs_pkv in zip( onnx_outputs["past_key_values"][i], transformers_outputs["past_key_values"][i] ): @@ -5517,6 
+5650,7 @@ class TestBothExportersORTModel(unittest.TestCase): ["automatic-speech-recognition", ORTModelForCTCIntegrationTest], ["audio-xvector", ORTModelForAudioXVectorIntegrationTest], ["audio-frame-classification", ORTModelForAudioFrameClassificationIntegrationTest], + ["image-to-image", ORTModelForImageToImageIntegrationTest], ] ) def test_find_untested_architectures(self, task: str, test_class): diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index bb6935461d7..0790f6329dc 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -144,6 +144,7 @@ "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", + "swin2sr": "hf-internal-testing/tiny-random-Swin2SRForImageSuperResolution", "t5": "hf-internal-testing/tiny-random-t5", "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel", "trocr": "microsoft/trocr-small-handwritten", From f7c3a7fa766f06af63a15e94b162ada56d021b16 Mon Sep 17 00:00:00 2001 From: Guillaume LEGENDRE Date: Fri, 27 Sep 2024 14:12:14 +0200 Subject: [PATCH 07/50] CI - update runner type (#2033) update runner type --- .github/workflows/test_fx_automatic_parallel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_fx_automatic_parallel.yml b/.github/workflows/test_fx_automatic_parallel.yml index d8af6e40caa..05ebf7ea9e5 100644 --- a/.github/workflows/test_fx_automatic_parallel.yml +++ b/.github/workflows/test_fx_automatic_parallel.yml @@ -24,7 +24,7 @@ jobs: config: - name: GPU-enabled Optimum Test Suite image: nvidia/cuda:12.4.1-devel-ubuntu22.04 - gpu_target: ["nvidia-multi-gpu-a10-runners"] + gpu_target: ["aws-g5-12xlarge-plus"] name: ${{ matrix.config.name }} runs-on: From c6b46786ce12b3a9d2e8be2b8f41342ec314f46a Mon Sep 17 00:00:00 2001 From: rbrugaro Date: Mon, 30 Sep 2024 02:50:23 -0700 Subject: [PATCH 08/50] Add ipex to documentation (#2027) * adding ipex reference in optimum docs * minor fix --- docs/source/index.mdx | 2 +- docs/source/installation.mdx | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 7eb79c33ed2..06133664ca8 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -36,7 +36,7 @@ The packages below enable you to get the best of the 🤗 Hugging Face ecosystem
Intel
-

Optimize your model to speedup inference with OpenVINO and Neural Compressor

+

Optimize your model to speed up inference with OpenVINO, Neural Compressor and IPEX

AWS Trainium/Inferentia
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index c08b3f92e5c..27733574c80 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -25,6 +25,7 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can | [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]` | | [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[neural-compressor]` | | [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[openvino]` | +| [IPEX](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[ipex]` | | [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` | | [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade --upgrade-strategy eager optimum[amd]` | | [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade --upgrade-strategy eager optimum[neuronx]` | From 049b00f61c9bb17bd2b20a3b77d04cc4c0f20d86 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 30 Sep 2024 18:51:01 +0200 Subject: [PATCH 09/50] Add Transformers v4.45 support (#2023) * transformers v4.45 support * fix transformers v4.45 compatibility * update opset * update model * Add generation config saving * fix codegen * bump default opset m2m100 * fix codegen * fix bettertransformers * add deprecation warning for bettertransformer * bettertransformers fixes * disable transformers 4.45 for onnx export * update model ID --- Makefile | 4 +- optimum/bettertransformer/models/attention.py | 84 +++++++++++++++++-- .../models/decoder_models.py | 35 +++++++- optimum/bettertransformer/transformation.py | 4 + optimum/exporters/onnx/convert.py | 18 ++++ optimum/exporters/onnx/model_configs.py | 11 +-- optimum/modeling_base.py | 3 + optimum/onnxruntime/modeling_decoder.py | 58 ++++++++----- optimum/onnxruntime/modeling_ort.py | 3 - optimum/onnxruntime/modeling_seq2seq.py | 64 +++++++------- optimum/onnxruntime/optimization.py | 12 ++- setup.py | 9 +- tests/bettertransformer/testing_utils.py | 4 +- tests/onnxruntime/utils_onnxruntime_tests.py | 6 +- 14 files changed, 223 insertions(+), 92 deletions(-) diff --git a/Makefile b/Makefile index e2c21263031..824ef3d0cf3 100644 --- a/Makefile +++ b/Makefile @@ -23,11 +23,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL)) # Run code quality checks style_check: black --check . - ruff . + ruff check . style: black . - ruff . --fix + ruff check . 
--fix # Run tests for the library test: diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 9dfa57844d4..22b8faf1c21 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -92,6 +92,71 @@ def gpt2_wrapped_scaled_dot_product( return sdpa_result, None +# Adapted from transformers.models.gptj.modeling_gptj.GPTJAttention._attn +def gptj_wrapped_scaled_dot_product( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, +): + raise_on_head_mask(head_mask) + batch_size = query.shape[0] + + mask_value = torch.finfo(value.dtype).min + mask_value = torch.full([], mask_value, dtype=value.dtype) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + query = query.to(value.dtype) + key = key.to(value.dtype) + + if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1: + raise ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.") + + dropout_p = self.dropout_prob_attn if self.training else 0.0 + if batch_size == 1 or self.training: + if query.shape[2] > 1: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True + ) + else: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=False + ) + else: + query_length, key_length = query.size(-2), key.size(-2) + + # causal_mask is always [True, ..., True] otherwise, so executing this + # is unnecessary + if query_length > 1: + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + + causal_mask = torch.where(causal_mask, 0, mask_value) + + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + if attention_mask is not None: + attention_mask = causal_mask + attention_mask + + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] + + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False + ) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + sdpa_result = sdpa_result.to(value.dtype) + + return sdpa_result, None + + # Adapted from transformers.models.bark.modeling_bark.BarkSelfAttention._attn def bark_wrapped_scaled_dot_product( self, @@ -195,7 +260,7 @@ def codegen_wrapped_scaled_dot_product( query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True ) else: - # in this case, which is the later decoding steps, the `causal_mask`` in + # in this case, which is the later decoding steps, the `causal_mask` in # https://github.com/huggingface/transformers/blob/ae54e3c3b18bac0832ad62ea9b896dfd52a09850/src/transformers/models/gpt2/modeling_gpt2.py#L195 # is [True, ..., True] so actually not causal sdpa_result = torch.nn.functional.scaled_dot_product_attention( @@ -207,15 +272,20 @@ def codegen_wrapped_scaled_dot_product( # causal_mask is always [True, ..., True] otherwise, so executing this # is unnecessary if query_length > 1: - causal_mask = self.causal_mask[:, :, key_length - 
query_length : key_length, :key_length].to(torch.bool) + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to( + torch.bool + ) - causal_mask = torch.where(causal_mask, 0, mask_value) + causal_mask = torch.where(causal_mask, 0, mask_value) - # torch.Tensor.expand does no memory copy - causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) - # we use torch.min to avoid having tensor(-inf) - attention_mask = torch.min(causal_mask, attention_mask) + # we use torch.min to avoid having tensor(-inf) + attention_mask = torch.min(causal_mask, attention_mask) + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] sdpa_result = torch.nn.functional.scaled_dot_product_attention( query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index b64b7f5a1eb..52d28d076d3 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -44,6 +44,7 @@ codegen_wrapped_scaled_dot_product, gpt2_wrapped_scaled_dot_product, gpt_neo_wrapped_scaled_dot_product, + gptj_wrapped_scaled_dot_product, opt_forward, t5_forward, ) @@ -82,7 +83,7 @@ def forward(self, *args, **kwargs): class GPTJAttentionLayerBetterTransformer(BetterTransformerBaseLayer, GPTJAttention, nn.Module): - _attn = gpt2_wrapped_scaled_dot_product + _attn = gptj_wrapped_scaled_dot_product def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super().__init__(config) @@ -96,14 +97,22 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): "out_proj", "attn_dropout", "resid_dropout", - "bias", "scale_attn", - "masked_bias", ] # Attribute only for transformers>=4.28 if hasattr(layer, "embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "bias"): + submodules.append("bias") + if hasattr(layer, "masked_bias"): + submodules.append("masked_bias") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -127,6 +136,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["rotary_emb", "query_key_value", "dense", "bias", "masked_bias", "norm_factor"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -155,6 +169,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["attn_dropout", "resid_dropout", "k_proj", "v_proj", "q_proj", "out_proj", "bias", "masked_bias"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_id"): + submodules.append("layer_id") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -238,12 +257,20 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super(BetterTransformerBaseLayer, self).__init__(config) self.module_mapping = None - submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "causal_mask", "scale_attn"] + submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "scale_attn"] # 
Attribute only for transformers>=4.28 if hasattr(layer, "embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "causal_mask"): + submodules.append("causal_mask") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py index 2105e199870..a101757b6fa 100644 --- a/optimum/bettertransformer/transformation.py +++ b/optimum/bettertransformer/transformation.py @@ -206,6 +206,10 @@ def transform( The converted model if the conversion has been successful. """ + logger.warning( + "The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release." + ) + hf_config = model.config if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]: raise ValueError( diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 63a9067b90c..f2bf95f3e3c 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -26,6 +26,7 @@ import numpy as np import onnx +import transformers from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available @@ -34,6 +35,7 @@ DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, TORCH_MINIMUM_VERSION, + check_if_transformers_greater, is_diffusers_available, is_torch_onnx_support_available, logging, @@ -999,6 +1001,10 @@ def onnx_export_from_model( >>> onnx_export_from_model(model, output="gpt2_onnx/") ``` """ + if check_if_transformers_greater("4.44.99"): + raise ImportError( + f"ONNX conversion disabled for now for transformers version greater than v4.45, found {transformers.__version__}" + ) TasksManager.standardize_model_attributes(model) @@ -1120,6 +1126,18 @@ def onnx_export_from_model( if isinstance(atol, dict): atol = atol[task.replace("-with-past", "")] + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + if model.can_generate() and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) + # Saving the model config and preprocessor as this is needed sometimes. model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d4b15b2968b..36963a986d0 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -119,7 +119,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class AlbertOnnxConfig(BertOnnxConfig): - DEFAULT_ONNX_OPSET = 11 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
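These opset bumps track PyTorch 2.1.1 making `F.scaled_dot_product_attention` the default attention path: its ONNX export lowers through operators such as Trilu that only exist in ai.onnx opset 14 and above. One way to confirm what an exported model actually declares (a sketch; the path is a placeholder):

```python
import onnx

model = onnx.load("albert_onnx/model.onnx")
for entry in model.opset_import:
    # an ai.onnx version below 14 here would explain failures on the SDPA path
    print(entry.domain or "ai.onnx", entry.version)
```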
class ConvBertOnnxConfig(BertOnnxConfig): @@ -171,11 +171,11 @@ class MPNetOnnxConfig(DistilBertOnnxConfig): class RobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class CamembertOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class FlaubertOnnxConfig(BertOnnxConfig): @@ -187,7 +187,7 @@ class IBertOnnxConfig(DistilBertOnnxConfig): class XLMRobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class DebertaOnnxConfig(BertOnnxConfig): @@ -257,7 +257,7 @@ class ImageGPTOnnxConfig(GPT2OnnxConfig): class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_attention_heads="num_heads") @@ -564,6 +564,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class M2M100OnnxConfig(TextSeq2SeqOnnxConfig): + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( encoder_num_layers="encoder_layers", decoder_num_layers="decoder_layers", diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 3da2d9d0d21..29521b7c0c6 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -371,6 +371,9 @@ def from_pretrained( export = from_transformers if len(model_id.split("@")) == 2: + logger.warning( + f"Specifying the `revision` as @{model_id.split('@')[1]} is deprecated and will be removed in v1.23, please use the `revision` argument instead." + ) if revision is not None: logger.warning( f"The argument `revision` was set to {revision} but will be ignored for {model_id.split('@')[1]}" diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index f6d4b7e20ab..bda3ec98d9a 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -14,7 +14,6 @@ """Classes handling causal-lm related architectures in ONNX Runtime.""" import logging -import warnings from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union @@ -149,6 +148,19 @@ def __init__( generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + self.onnx_paths = [self.model_path] self.use_merged = "use_cache_branch" in self.input_names self.model_type = self.config.model_type @@ -393,7 +405,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -410,15 +421,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -586,6 +589,22 @@ def _from_pretrained( else: init_cls = ORTModelForCausalLM + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) + return init_cls( model=model, config=config, @@ -593,6 +612,7 @@ def _from_pretrained( model_save_dir=model_save_dir, preprocessors=preprocessors, use_cache=use_cache, + generation_config=generation_config, ) @classmethod @@ -600,7 +620,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -616,15 +635,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - file_name = ONNX_WEIGHTS_NAME if use_merged: @@ -655,8 +665,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -712,6 +720,10 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> for layer_past in past ) + def _save_pretrained(self, save_directory: Union[str, Path]): + super()._save_pretrained(save_directory) + self.generation_config.save_pretrained(save_directory) + class ORTGPTBigCodeForCausalLM(ORTModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM.prepare_inputs_for_generation diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 9166f7c2cbe..17bd3e2a4e7 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -663,8 +663,6 @@ def _export( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -1171,7 +1169,6 @@ def _export( library_name="transformers", ) - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 3cecadafe3e..fda3ca82bbe 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -18,7 +18,6 @@ import logging import shutil -import warnings from abc import ABC, abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -706,6 +705,18 @@ def show_deprecated_argument(arg_name): generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + @abstractmethod def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: pass @@ -780,7 +791,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -799,15 +809,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -996,19 +998,21 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - generation_config = None - try: - generation_config = GenerationConfig.from_pretrained( - model_id, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder=subfolder, - ) - except OSError: - logger.info("Generation config file not found, using a generation config created from the model config.") + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) onnx_paths = [encoder_path] if use_merged is False: @@ -1035,7 +1039,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -1051,15 +1054,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForConditionalGeneration": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - if use_cache is False and use_merged is True: raise ValueError( "The incompatible arguments use_cache=False, use_merged=True were passed to" @@ -1091,8 +1085,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/optimization.py b/optimum/onnxruntime/optimization.py index 9e62a3f324c..fd6958bba7d 100644 --- a/optimum/onnxruntime/optimization.py +++ b/optimum/onnxruntime/optimization.py @@ -20,6 +20,7 @@ import onnx from onnx import load_model +from transformers import GenerationConfig from transformers.models.auto.configuration_auto import AutoConfig from onnxruntime.transformers.onnx_model_bert import BertOnnxModel @@ -152,10 +153,6 @@ def optimize( save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) ORTConfigManager.check_optimization_supported_model(self.model_type, optimization_config) - - self.config.save_pretrained(save_dir) - maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) - model_type = ORTConfigManager.get_model_ort_type(self.config.model_type) optimization_options = optimization_config.create_fusion_options(model_type) @@ -236,6 +233,13 @@ def optimize( # Save the model configuration self.config.save_pretrained(save_dir) ort_config.save_pretrained(save_dir) + maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) + + try: + generation_config = GenerationConfig.from_pretrained(self.onnx_model_path[0].parent) + generation_config.save_pretrained(save_dir) + except Exception: + pass logger.info( f"Optimized model saved at: {save_dir} (external data format: " diff --git a/setup.py b/setup.py index ac5db71a74b..24c1ae1cd4d 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.29,<4.45.0", + "transformers[sentencepiece]>=4.29,<4.46.0", "torch>=1.11", "packaging", "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 @@ -54,6 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", + "transformers<4.45.0", ], "onnxruntime-gpu": [ "onnx", @@ -62,9 +63,10 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. 
+ "transformers<4.45.0", ], - "exporters": ["onnx", "onnxruntime", "timm"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], + "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.45.0"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.45.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", @@ -75,6 +77,7 @@ "numpy<1.24.0", "datasets<=2.16", "transformers[sentencepiece]>=4.26,<4.38", + "transformers<4.45.0", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index e9e2edd9790..098882180aa 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -59,12 +59,12 @@ # "llama": "fxmarty/tiny-llama-fast-tokenizer", # "llama-gqa": "noamwies/llama-test-gqa-with-better-transformer", "m2m_100": "hf-internal-testing/tiny-random-nllb", - "marian": "fxmarty/tiny-marian", # the other tiny ones have a too small max_position_embeddings + "marian": "optimum-internal-testing/tiny-random-marian", # the other tiny ones have a too small max_position_embeddings "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "hf-internal-testing/tiny-random-mbart", "opt": "hf-internal-testing/tiny-random-OPTModel", "pegasus": "hf-internal-testing/tiny-random-PegasusModel", - "prophetnet": "hirotasoshu/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings + "prophetnet": "optimum-internal-testing/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings "rembert": "hf-internal-testing/tiny-random-RemBertModel", "roberta": "hf-internal-testing/tiny-random-RobertaModel", "rocbert": "hf-internal-testing/tiny-random-RoCBertModel", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 0790f6329dc..17f3b391b04 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -112,9 +112,9 @@ "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", "longt5": "hf-internal-testing/tiny-random-LongT5Model", - "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama": "optimum-internal-testing/tiny-random-llama", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", - "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken + "marian": "echarlaix/tiny-random-marian", "mbart": "hf-internal-testing/tiny-random-mbart", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", @@ -152,7 +152,7 @@ "unispeech_sat": "hf-internal-testing/tiny-random-UnispeechSatModel", "vision-encoder-decoder": "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2", "vit": "hf-internal-testing/tiny-random-vit", - "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken + "whisper": "optimum-internal-testing/tiny-random-whisper", "wav2vec2": "hf-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer", "wavlm": "hf-internal-testing/tiny-random-WavlmModel", From d9754abdd973a69829dda191c495c4e70359d8dc Mon Sep 17 00:00:00 2001 From: Vijay Date: Tue, 8 Oct 2024 15:58:24 +0530 Subject: [PATCH 10/50] Remove numpy version constraint in setup.py (#2039) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py 
index 24c1ae1cd4d..0e2f0fd1bb6 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ "transformers[sentencepiece]>=4.29,<4.46.0", "torch>=1.11", "packaging", - "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 + "numpy", "huggingface_hub>=0.8.0", "datasets", ] From d3c56cd55444de15499c8d72a501d07631eff5ae Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:17:43 +0200 Subject: [PATCH 11/50] Update/Fix Pipeline Mixins and ORT Pipelines (#2021) * created auto task mappings * added correct auto classes * created auto task mappings * added correct auto classes * added ort/auto diffusion classes * fix ORTPipeline detection * start test refactoring * dynamic dtype * support torch random numbers generator * compact diffusion testing suite * fix * test * test * test * use latent-consistency architecture name instead of lcm * fix * add ort diffusion pipeline tests * added dummy objects * remove duplicate code * update stable diffusion mixin * update latent consistency * update sd for img2img * update latent consistency * update model parts to use frozen dict * update tests and utils * updated all mixins, enabled all tests ; all are passing except some reproducibility and comparaison tests (7 failed, 35 passed) * fix sd xl hidden states * style * support testing without diffusers * remove unnecessary * revert * export vae encoder by returning its latent distribution parameters * fix the modeling to handle distributions * create vae class to minimize changes in pipeline mixins * remove unnecessary tests * style * style * update diffusion models export test * style * fall back for when block_out_channels is not in vae config * remove model parts from optimum.onnxruntime * added .to to model parts * remove custom mixins * style * Update optimum/exporters/onnx/model_configs.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/exporters/onnx/model_configs.py * conversion to numpy always work * test adding two new pipelines * remove duplicated tests * match diffusers numpy input * simplify model saving * extend tests and only translate generators * cleanup * reduce parent model usage in model parts * fix * new tiny onnx diffusion model with configs * model_save_path * Update optimum/onnxruntime/modeling_diffusion.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * migrate tiny-stable-diffusion-onnx * resolve breaking change and mandatory arguments * overwrite _get_add_time_ids * fix * remove inference calls from loading tests * misc * better compatibility between model parts and parent pipeline * remove subfolder * misc * update * support passing safety checker * dummies * remove the need for ORTPipeline --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/onnx/model_configs.py | 6 +- optimum/exporters/utils.py | 24 +- optimum/onnx/utils.py | 16 + optimum/onnxruntime/__init__.py | 8 + optimum/onnxruntime/base.py | 19 + optimum/onnxruntime/modeling_diffusion.py | 967 ++++++++++-------- optimum/onnxruntime/modeling_ort.py | 2 +- optimum/onnxruntime/modeling_seq2seq.py | 2 +- optimum/onnxruntime/utils.py | 15 + .../diffusers/pipeline_latent_consistency.py | 230 ----- .../diffusers/pipeline_stable_diffusion.py | 427 -------- .../pipeline_stable_diffusion_img2img.py | 309 ------ .../pipeline_stable_diffusion_inpaint.py | 353 ------- 
.../diffusers/pipeline_stable_diffusion_xl.py | 506 --------- .../pipeline_stable_diffusion_xl_img2img.py | 515 ---------- optimum/pipelines/diffusers/pipeline_utils.py | 282 ----- optimum/pipelines/diffusers/watermark.py | 31 - tests/exporters/onnx/test_onnx_export.py | 15 +- tests/onnxruntime/test_diffusion.py | 578 +++++------ tests/onnxruntime/test_modeling.py | 14 +- tests/onnxruntime/utils_onnxruntime_tests.py | 5 + 21 files changed, 914 insertions(+), 3410 deletions(-) delete mode 100644 optimum/pipelines/diffusers/pipeline_latent_consistency.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py delete mode 100644 optimum/pipelines/diffusers/pipeline_utils.py delete mode 100644 optimum/pipelines/diffusers/watermark.py diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 36963a986d0..e77f649f69b 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1112,7 +1112,7 @@ def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]: class VaeEncoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-2 + ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1132,12 +1132,12 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, + "latent_parameters": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } class VaeDecoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-3 + ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index e2125736c4d..949b54f4685 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -46,11 +46,6 @@ from diffusers import ( DiffusionPipeline, - LatentConsistencyModelImg2ImgPipeline, - LatentConsistencyModelPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, @@ -92,27 +87,13 @@ def _get_submodels_for_export_diffusion( Returns the components of a Stable Diffusion model. 
""" - is_stable_diffusion = isinstance( - pipeline, (StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline) - ) is_stable_diffusion_xl = isinstance( pipeline, (StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline) ) - is_latent_consistency_model = isinstance( - pipeline, (LatentConsistencyModelPipeline, LatentConsistencyModelImg2ImgPipeline) - ) - if is_stable_diffusion_xl: projection_dim = pipeline.text_encoder_2.config.projection_dim - elif is_stable_diffusion: - projection_dim = pipeline.text_encoder.config.projection_dim - elif is_latent_consistency_model: - projection_dim = pipeline.text_encoder.config.projection_dim else: - raise ValueError( - f"The export of a DiffusionPipeline model with the class name {pipeline.__class__.__name__} is currently not supported in Optimum. " - "Please open an issue or submit a PR to add the support." - ) + projection_dim = pipeline.text_encoder.config.projection_dim models_for_export = {} @@ -139,7 +120,8 @@ def _get_submodels_for_export_diffusion( vae_encoder = copy.deepcopy(pipeline.vae) if not is_torch_greater_or_equal_than_2_1: vae_encoder = override_diffusers_2_0_attn_processors(vae_encoder) - vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample()} + # we return the distribution parameters to be able to recreate it in the decoder + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} models_for_export["vae_encoder"] = vae_encoder # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 diff --git a/optimum/onnx/utils.py b/optimum/onnx/utils.py index b52c4f4cdac..c014c1b3429 100644 --- a/optimum/onnx/utils.py +++ b/optimum/onnx/utils.py @@ -71,6 +71,22 @@ def _get_external_data_paths(src_paths: List[Path], dst_paths: List[Path]) -> Tu return src_paths, dst_paths +def _get_model_external_data_paths(model_path: Path) -> List[Path]: + """ + Gets external data paths from the model. + """ + + onnx_model = onnx.load(str(model_path), load_external_data=False) + model_tensors = _get_initializer_tensors(onnx_model) + # filter out tensors that are not external data + model_tensors_ext = [ + ExternalDataInfo(tensor).location + for tensor in model_tensors + if tensor.HasField("data_location") and tensor.data_location == onnx.TensorProto.EXTERNAL + ] + return [model_path.parent / tensor_name for tensor_name in model_tensors_ext] + + def check_model_uses_external_data(model: onnx.ModelProto) -> bool: """ Checks if the model uses external data. 
diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 1cb5b7c47b9..4e25a436909 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -79,7 +79,9 @@ "ORTStableDiffusionInpaintPipeline", "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", + "ORTStableDiffusionXLInpaintPipeline", "ORTLatentConsistencyModelPipeline", + "ORTLatentConsistencyModelImg2ImgPipeline", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", "ORTPipelineForText2Image", @@ -92,6 +94,8 @@ "ORTStableDiffusionInpaintPipeline", "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", + "ORTStableDiffusionXLInpaintPipeline", + "ORTLatentConsistencyModelImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", @@ -148,6 +152,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( ORTDiffusionPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, @@ -156,11 +161,13 @@ ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: from .modeling_diffusion import ( ORTDiffusionPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, @@ -169,6 +176,7 @@ ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index 0e54bafed78..845780cafad 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -71,6 +71,25 @@ def dtype(self): return None + def to(self, *args, device: Optional[Union[torch.device, str, int]] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, torch.device): + device = arg + elif isinstance(arg, torch.dtype): + dtype = arg + + if device is not None and device != self.device: + raise ValueError( + "Cannot change the device of a model part without changing the device of the parent model. " + "Please use the `to` method of the parent model to change the device." + ) + + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " + f"Please export the model with the desired dtype." + ) + @abstractmethod def forward(self, *args, **kwargs): pass diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 18cd38c5f29..87fcb68c7e9 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -13,10 +13,11 @@ # limitations under the License. 
 import importlib
+import inspect
 import logging
 import os
 import shutil
-import warnings
+from abc import abstractmethod
 from collections import OrderedDict
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -24,23 +25,25 @@
 import numpy as np
 import torch
-from diffusers import (
+from diffusers.configuration_utils import ConfigMixin
+from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
+from diffusers.pipelines import (
     AutoPipelineForImage2Image,
     AutoPipelineForInpainting,
     AutoPipelineForText2Image,
-    ConfigMixin,
-    DDIMScheduler,
+    LatentConsistencyModelImg2ImgPipeline,
     LatentConsistencyModelPipeline,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
     StableDiffusionImg2ImgPipeline,
     StableDiffusionInpaintPipeline,
     StableDiffusionPipeline,
     StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLInpaintPipeline,
     StableDiffusionXLPipeline,
 )
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.schedulers import SchedulerMixin
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
-from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
+from diffusers.utils.constants import CONFIG_NAME
 from huggingface_hub import snapshot_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from huggingface_hub.utils import validate_hf_hub_args
@@ -51,14 +54,7 @@
 import onnxruntime as ort

 from ..exporters.onnx import main_export
-from ..onnx.utils import _get_external_data_paths
-from ..pipelines.diffusers.pipeline_latent_consistency import LatentConsistencyPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin
-from ..pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin
-from ..pipelines.diffusers.pipeline_utils import VaeImageProcessor
+from ..onnx.utils import _get_model_external_data_paths
 from ..utils import (
     DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER,
     DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
@@ -66,12 +62,12 @@
     DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
 )
-from .base import ORTModelPart
 from .io_binding import TypeHelper
 from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel
 from .utils import (
     ONNX_WEIGHTS_NAME,
     get_provider_for_device,
+    np_to_pt_generators,
     parse_device,
     validate_provider_availability,
 )
@@ -80,380 +76,287 @@
 logger = logging.getLogger(__name__)


-class ORTPipeline(ORTModel):
-    auto_model_class = None
-    model_type = "onnx_pipeline"
-
+# TODO: support from_pipe()
+# TODO: Instead of ORTModel, it makes sense to have a compositional ORTMixin
+# TODO: instead of one bloated __init__, we should consider an __init__ per pipeline
+class ORTDiffusionPipeline(ORTModel, DiffusionPipeline):
     config_name = "model_index.json"
-    sub_component_config_name = "config.json"
+    auto_model_class = DiffusionPipeline

     def __init__(
         self,
-        vae_decoder_session: ort.InferenceSession,
+        scheduler: "SchedulerMixin",
         unet_session: ort.InferenceSession,
-        tokenizer: CLIPTokenizer,
-        config: Dict[str, Any],
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
-        feature_extractor: Optional[CLIPFeatureExtractor] = None,
+        vae_decoder_session: ort.InferenceSession,
+        # optional pipeline models
         vae_encoder_session: Optional[ort.InferenceSession] = None,
         text_encoder_session: Optional[ort.InferenceSession] = None,
         text_encoder_2_session: Optional[ort.InferenceSession] = None,
-        tokenizer_2: Optional[CLIPTokenizer] = None,
+        # optional pipeline submodels
+        tokenizer: Optional["CLIPTokenizer"] = None,
+        tokenizer_2: Optional["CLIPTokenizer"] = None,
+        feature_extractor: Optional["CLIPFeatureExtractor"] = None,
+        # stable diffusion xl specific arguments
+        force_zeros_for_empty_prompt: bool = True,
+        requires_aesthetics_score: bool = False,
+        add_watermarker: Optional[bool] = None,
+        # onnxruntime specific arguments
        use_io_binding: Optional[bool] = None,
        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        **kwargs,
     ):
-        """
-        Args:
-            vae_decoder_session (`ort.InferenceSession`):
-                The ONNX Runtime inference session associated to the VAE decoder
-            unet_session (`ort.InferenceSession`):
-                The ONNX Runtime inference session associated to the U-NET.
-            tokenizer (`CLIPTokenizer`):
-                Tokenizer of class
-                [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
-                for the text encoder.
-            config (`Dict[str, Any]`):
-                A config dictionary from which the model components will be instantiated. Make sure to only load
-                configuration files of compatible classes.
-            scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`):
-                A scheduler to be used in combination with the U-NET component to denoise the encoded image latents.
-            feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`):
-                A model extracting features from generated images to be used as inputs for the `safety_checker`
-            vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`):
-                The ONNX Runtime inference session associated to the VAE encoder.
-            text_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`):
-                The ONNX Runtime inference session associated to the text encoder.
-            tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`):
-                Tokenizer of class
-                [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
-                for the second text encoder.
-            use_io_binding (`Optional[bool]`, defaults to `None`):
-                Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to
-                `True` if the device is CUDA, otherwise defaults to `False`.
-            model_save_dir (`Optional[str]`, defaults to `None`):
-                The directory under which the model exported to ONNX was saved.
- """ - self.shared_attributes_init( - model=vae_decoder_session, - use_io_binding=use_io_binding, - model_save_dir=model_save_dir, - ) - self._internal_dict = config - self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) - self.vae_decoder_model_path = Path(vae_decoder_session._model_path) self.unet = ORTModelUnet(unet_session, self) - self.unet_model_path = Path(unet_session._model_path) - - if text_encoder_session is not None: - self.text_encoder_model_path = Path(text_encoder_session._model_path) - self.text_encoder = ORTModelTextEncoder(text_encoder_session, self) - else: - self.text_encoder_model_path = None - self.text_encoder = None - - if vae_encoder_session is not None: - self.vae_encoder_model_path = Path(vae_encoder_session._model_path) - self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) - else: - self.vae_encoder_model_path = None - self.vae_encoder = None + self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) + self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None + self.text_encoder = ( + ORTModelTextEncoder(text_encoder_session, self) if text_encoder_session is not None else None + ) + self.text_encoder_2 = ( + ORTModelTextEncoder(text_encoder_2_session, self) if text_encoder_2_session is not None else None + ) + # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API + self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) - if text_encoder_2_session is not None: - self.text_encoder_2_model_path = Path(text_encoder_2_session._model_path) - self.text_encoder_2 = ORTModelTextEncoder(text_encoder_2_session, self) - else: - self.text_encoder_2_model_path = None - self.text_encoder_2 = None + # we allow passing these as torch models for now + self.image_encoder = kwargs.pop("image_encoder", None) # TODO: maybe implement ORTModelImageEncoder + self.safety_checker = kwargs.pop("safety_checker", None) # TODO: maybe implement ORTModelSafetyChecker + self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 - self.scheduler = scheduler self.feature_extractor = feature_extractor - self.safety_checker = None - - sub_models = { - DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder, - DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet, - DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder, - DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder, - DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2, - } - - # Modify config to keep the resulting model compatible with diffusers pipelines - for name in sub_models.keys(): - self._internal_dict[name] = ( - ("diffusers", "OnnxRuntimeModel") if sub_models[name] is not None else (None, None) - ) - self._internal_dict.pop("vae", None) - - if "block_out_channels" in self.vae_decoder.config: - self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) - else: - self.vae_scale_factor = 8 - - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - - @staticmethod - def load_model( - vae_decoder_path: Union[str, Path], - text_encoder_path: Union[str, Path], - unet_path: Union[str, Path], - vae_encoder_path: Optional[Union[str, Path]] = None, - text_encoder_2_path: Optional[Union[str, Path]] = None, - provider: str = "CPUExecutionProvider", - session_options: Optional[ort.SessionOptions] = None, - provider_options: Optional[Dict] = None, - ): - """ - Creates three inference sessions for respectively the VAE decoder, the text encoder 
and the U-NET models. - The default provider is `CPUExecutionProvider` to match the default behaviour in PyTorch/TensorFlow/JAX. - Args: - vae_decoder_path (`Union[str, Path]`): - The path to the VAE decoder ONNX model. - text_encoder_path (`Union[str, Path]`): - The path to the text encoder ONNX model. - unet_path (`Union[str, Path]`): - The path to the U-NET ONNX model. - vae_encoder_path (`Union[str, Path]`, defaults to `None`): - The path to the VAE encoder ONNX model. - text_encoder_2_path (`Union[str, Path]`, defaults to `None`): - The path to the second text decoder ONNX model. - provider (`str`, defaults to `"CPUExecutionProvider"`): - ONNX Runtime provider to use for loading the model. See https://onnxruntime.ai/docs/execution-providers/ - for possible providers. - session_options (`Optional[ort.SessionOptions]`, defaults to `None`): - ONNX Runtime session options to use for loading the model. Defaults to `None`. - provider_options (`Optional[Dict]`, defaults to `None`): - Provider option dictionary corresponding to the provider used. See available options - for each provider: https://onnxruntime.ai/docs/api/c/group___global.html . Defaults to `None`. - """ - vae_decoder = ORTModel.load_model(vae_decoder_path, provider, session_options, provider_options) - unet = ORTModel.load_model(unet_path, provider, session_options, provider_options) - - sessions = { - "vae_encoder": vae_encoder_path, - "text_encoder": text_encoder_path, - "text_encoder_2": text_encoder_2_path, + all_pipeline_init_args = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, + "scheduler": self.scheduler, + "tokenizer": self.tokenizer, + "tokenizer_2": self.tokenizer_2, + "feature_extractor": self.feature_extractor, + "requires_aesthetics_score": requires_aesthetics_score, + "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, + "add_watermarker": add_watermarker, } - for key, value in sessions.items(): - if value is not None and value.is_file(): - sessions[key] = ORTModel.load_model(value, provider, session_options, provider_options) - else: - sessions[key] = None + diffusers_pipeline_args = {} + for key in inspect.signature(self.auto_model_class).parameters.keys(): + if key in all_pipeline_init_args: + diffusers_pipeline_args[key] = all_pipeline_init_args[key] + # inits diffusers pipeline specific attributes (registers modules and config) + self.auto_model_class.__init__(self, **diffusers_pipeline_args) - return vae_decoder, sessions["text_encoder"], unet, sessions["vae_encoder"], sessions["text_encoder_2"] + # inits ort specific attributes + self.shared_attributes_init( + model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs + ) def _save_pretrained(self, save_directory: Union[str, Path]): save_directory = Path(save_directory) - src_to_dst_path = { - self.vae_decoder_model_path: save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.text_encoder_model_path: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.unet_model_path: save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME, - } - sub_models_to_save = { - self.vae_encoder_model_path: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, - self.text_encoder_2_model_path: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + models_to_save_paths = { + (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER), + 
(self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER), + (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), + (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), + (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), } - for path, subfolder in sub_models_to_save.items(): - if path is not None: - src_to_dst_path[path] = save_directory / subfolder / ONNX_WEIGHTS_NAME - - # TODO: Modify _get_external_data_paths to give dictionnary - src_paths = list(src_to_dst_path.keys()) - dst_paths = list(src_to_dst_path.values()) - # Add external data paths in case of large models - src_paths, dst_paths = _get_external_data_paths(src_paths, dst_paths) - - for src_path, dst_path in zip(src_paths, dst_paths): - dst_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copyfile(src_path, dst_path) - config_path = src_path.parent / self.sub_component_config_name - if config_path.is_file(): - shutil.copyfile(config_path, dst_path.parent / self.sub_component_config_name) + for model, save_path in models_to_save_paths: + if model is not None: + model_path = Path(model.session._model_path) + save_path.mkdir(parents=True, exist_ok=True) + # copy onnx model + shutil.copyfile(model_path, save_path / ONNX_WEIGHTS_NAME) + # copy external onnx data + external_data_paths = _get_model_external_data_paths(model_path) + for external_data_path in external_data_paths: + shutil.copyfile(external_data_path, save_path / external_data_path.name) + # copy model config + config_path = model_path.parent / CONFIG_NAME + if config_path.is_file(): + config_save_path = save_path / CONFIG_NAME + shutil.copyfile(config_path, config_save_path) self.scheduler.save_pretrained(save_directory / "scheduler") - if self.feature_extractor is not None: - self.feature_extractor.save_pretrained(save_directory / "feature_extractor") if self.tokenizer is not None: self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.feature_extractor is not None: + self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @classmethod def _from_pretrained( cls, model_id: Union[str, Path], config: Dict[str, Any], - use_auth_token: Optional[Union[bool, str]] = None, - token: Optional[Union[bool, str]] = None, + subfolder: str = "", + force_download: bool = False, + local_files_only: bool = False, revision: Optional[str] = None, + trust_remote_code: bool = False, cache_dir: str = HUGGINGFACE_HUB_CACHE, - vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, - text_encoder_file_name: str = ONNX_WEIGHTS_NAME, + token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, + vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, + text_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, - local_files_only: bool = False, + use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", - session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, - use_io_binding: Optional[bool] = None, + session_options: Optional[ort.SessionOptions] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, + if use_io_binding: + raise ValueError( + "IOBinding is not yet available for diffusion pipelines, please set `use_io_binding` to False." ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - - if provider == "TensorrtExecutionProvider": - raise ValueError("The provider `'TensorrtExecutionProvider'` is not supported") - model_id = str(model_id) - patterns = set(config.keys()) - sub_models_to_load = patterns.intersection({"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}) - - if not os.path.isdir(model_id): - patterns.update({"vae_encoder", "vae_decoder"}) - allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} + if not os.path.isdir(str(model_id)): + all_components = {key for key in config.keys() if not key.startswith("_")} | {"vae_encoder", "vae_decoder"} + allow_patterns = {os.path.join(component, "*") for component in all_components} allow_patterns.update( { - vae_decoder_file_name, - text_encoder_file_name, unet_file_name, + vae_decoder_file_name, vae_encoder_file_name, + text_encoder_file_name, text_encoder_2_file_name, SCHEDULER_CONFIG_NAME, - CONFIG_NAME, cls.config_name, + CONFIG_NAME, } ) - # Downloads all repo's files matching the allowed patterns - model_id = snapshot_download( + model_save_folder = snapshot_download( model_id, cache_dir=cache_dir, + force_download=force_download, local_files_only=local_files_only, - token=token, revision=revision, + token=token, allow_patterns=allow_patterns, ignore_patterns=["*.msgpack", "*.safetensors", "*.bin", "*.xml"], ) - new_model_save_dir = Path(model_id) + else: + model_save_folder = str(model_id) + + model_save_path = Path(model_save_folder) + + if model_save_dir is None: + model_save_dir = model_save_path + + model_paths = { + "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, + "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, + "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, + "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + } + + sessions = {} + for model, path in model_paths.items(): + if kwargs.get(model, None) is not None: + # this allows passing a model directly to from_pretrained + sessions[f"{model}_session"] = kwargs.pop(model) + else: + sessions[f"{model}_session"] = ( + ORTModel.load_model(path, provider, session_options, provider_options) if path.is_file() else None + ) - sub_models = {} - for name in sub_models_to_load: - library_name, library_classes = config[name] - if library_classes is not None: + submodels = {} + for submodel in {"scheduler", "tokenizer", "tokenizer_2", "feature_extractor"}: + if kwargs.get(submodel, None) is not None: + submodels[submodel] = kwargs.pop(submodel) + elif config.get(submodel, (None, None))[0] is not None: + library_name, library_classes = config.get(submodel) library = importlib.import_module(library_name) class_obj = getattr(library, library_classes) load_method = getattr(class_obj, "from_pretrained") # Check if the module is in a subdirectory - if (new_model_save_dir / name).is_dir(): - sub_models[name] = load_method(new_model_save_dir / name) + if (model_save_path / submodel).is_dir(): + submodels[submodel] = 
load_method(model_save_path / submodel) else: - sub_models[name] = load_method(new_model_save_dir) - - vae_decoder, text_encoder, unet, vae_encoder, text_encoder_2 = cls.load_model( - vae_decoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, - text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, - unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_2_path=( - new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name - ), - provider=provider, - session_options=session_options, - provider_options=provider_options, - ) - - if model_save_dir is None: - model_save_dir = new_model_save_dir + submodels[submodel] = load_method(model_save_path) - if use_io_binding: - raise ValueError( - "IOBinding is not yet available for stable diffusion model, please set `use_io_binding` to False." - ) + # same as DiffusionPipeline.from_pretraoned, if called directly, it loads the class in the config + if cls.__name__ == "ORTDiffusionPipeline": + class_name = config["_class_name"] + ort_pipeline_class = _get_ort_class(class_name) + else: + ort_pipeline_class = cls - return cls( - vae_decoder_session=vae_decoder, - text_encoder_session=text_encoder, - unet_session=unet, - config=config, - tokenizer=sub_models.get("tokenizer", None), - scheduler=sub_models.get("scheduler"), - feature_extractor=sub_models.get("feature_extractor", None), - tokenizer_2=sub_models.get("tokenizer_2", None), - vae_encoder_session=vae_encoder, - text_encoder_2_session=text_encoder_2, + ort_pipeline = ort_pipeline_class( + **sessions, + **submodels, use_io_binding=use_io_binding, model_save_dir=model_save_dir, + **kwargs, ) + # same as in DiffusionPipeline.from_pretrained, we save where the model was instantiated from + ort_pipeline.register_to_config(_name_or_path=config.get("_name_or_path", str(model_id))) + + return ort_pipeline + @classmethod - def _from_transformers( + def _export( cls, model_id: str, - config: Optional[str] = None, - use_auth_token: Optional[Union[bool, str]] = None, - token: Optional[Union[bool, str]] = None, - revision: str = "main", - force_download: bool = True, - cache_dir: str = HUGGINGFACE_HUB_CACHE, + config: Dict[str, Any], subfolder: str = "", + force_download: bool = False, local_files_only: bool = False, + revision: Optional[str] = None, trust_remote_code: bool = False, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, + use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, - use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTPipeline": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + **kwargs, + ) -> "ORTDiffusionPipeline": if task is None: task = cls._auto_model_to_task(cls.auto_model_class) - save_dir = TemporaryDirectory() - save_dir_path = Path(save_dir.name) + # we continue passing the model_save_dir from here on to avoid it being cleaned up + # might be better to use a persistent temporary directory such as the one implemented in + # https://gist.github.com/twolfson/2929dc1163b0a76d2c2b66d51f9bc808 + model_save_dir = TemporaryDirectory() + model_save_path = Path(model_save_dir.name) main_export( - model_name_or_path=model_id, - output=save_dir_path, - task=task, + model_id, + output=model_save_path, do_validation=False, no_post_process=True, - subfolder=subfolder, + token=token, revision=revision, cache_dir=cache_dir, - token=token, - local_files_only=local_files_only, + subfolder=subfolder, force_download=force_download, + local_files_only=local_files_only, trust_remote_code=trust_remote_code, + library_name="diffusers", + task=task, ) return cls._from_pretrained( - save_dir_path, + model_save_path, config=config, provider=provider, - session_options=session_options, provider_options=provider_options, + session_options=session_options, use_io_binding=use_io_binding, - model_save_dir=save_dir, + model_save_dir=model_save_dir, + **kwargs, ) def to(self, device: Union[torch.device, str, int]): @@ -471,19 +374,22 @@ def to(self, device: Union[torch.device, str, int]): device, provider_options = parse_device(device) provider = get_provider_for_device(device) - validate_provider_availability(provider) # raise error if the provider is not available + validate_provider_availability(provider) if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": return self - self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) - self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) self.unet.session.set_providers([provider], provider_options=[provider_options]) + self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) if self.vae_encoder is not None: self.vae_encoder.session.set_providers([provider], provider_options=[provider_options]) + if self.text_encoder is not None: + self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) + if self.text_encoder_2 is not None: + self.text_encoder_2.session.set_providers([provider], provider_options=[provider_options]) - self.providers = self.vae_decoder.session.get_providers() + self.providers = self.unet.session.get_providers() self._device = device return self @@ -495,41 +401,142 @@ def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): def _save_config(self, save_directory): self.save_config(save_directory) + @property + def components(self) -> Dict[str, Any]: + components = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, + } + components = {k: v for k, v in components.items() if v is not None} + return components -class ORTPipelinePart(ORTModelPart): - CONFIG_NAME = "config.json" + def __call__(self, *args, **kwargs): + # we do this to keep numpy random states support for now + # TODO: deprecate and 
add warnings when a random state is passed - def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): - config_path = Path(session._model_path).parent / self.CONFIG_NAME + args = list(args) + for i in range(len(args)): + args[i] = np_to_pt_generators(args[i], self.device) - if config_path.is_file(): - # TODO: use FrozenDict - self.config = parent_model._dict_from_json_file(config_path) - else: - self.config = {} + for k, v in kwargs.items(): + kwargs[k] = np_to_pt_generators(v, self.device) + + return self.auto_model_class.__call__(self, *args, **kwargs) - super().__init__(session, parent_model) + +class ORTPipelinePart(ConfigMixin): + config_name: str = CONFIG_NAME + + def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTDiffusionPipeline): + self.session = session + self.parent_pipeline = parent_pipeline + + self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} + self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in self.session.get_inputs()} + self.output_dtypes = {output_key.name: output_key.type for output_key in self.session.get_outputs()} + + config_file_path = Path(session._model_path).parent / self.config_name + if not config_file_path.is_file(): + # config is mandatory for the model part to be used for inference + raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_file_path}") + config_dict = self._dict_from_json_file(config_file_path) + self.register_to_config(**config_dict) @property - def input_dtype(self): - # for backward compatibility and diffusion mixins (will be standardized in the future) - return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} + def device(self): + return self.parent_pipeline.device + @property + def dtype(self): + for dtype in self.input_dtypes.values(): + torch_dtype = TypeHelper.ort_type_to_torch_type(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = TypeHelper.ort_type_to_torch_type(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + + def to(self, *args, device: Optional[Union[torch.device, str, int]] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, torch.device): + device = arg + elif isinstance(arg, (int, str)): + device = torch.device(arg) + elif isinstance(arg, torch.dtype): + dtype = arg + + if device is not None and device != self.device: + raise ValueError( + "Cannot change the device of a pipeline part without changing the device of the parent pipeline. " + "Please use the `to` method of the parent pipeline to change the device." + ) -class ORTModelTextEncoder(ORTPipelinePart): - def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): - use_torch = isinstance(input_ids, torch.Tensor) + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the pipeline from {self.dtype} to {dtype}. " + f"Please export the pipeline with the desired dtype." 
+ ) - model_inputs = {"input_ids": input_ids} + def prepare_onnx_inputs(self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray]) -> Dict[str, np.ndarray]: + onnx_inputs = {} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) - onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + # converts pytorch inputs into numpy inputs for onnx + for input_name in self.input_names.keys(): + onnx_inputs[input_name] = inputs.pop(input_name) - return ModelOutput(**model_outputs) + if use_torch: + onnx_inputs[input_name] = onnx_inputs[input_name].numpy(force=True) + + if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]: + onnx_inputs[input_name] = onnx_inputs[input_name].astype( + TypeHelper.ort_type_to_numpy_type(self.input_dtypes[input_name]) + ) + + return onnx_inputs + + def prepare_onnx_outputs( + self, use_torch: bool, *onnx_outputs: np.ndarray + ) -> Dict[str, Union[torch.Tensor, np.ndarray]]: + model_outputs = {} + + # converts onnxruntime outputs into tensor for standard outputs + for output_name, idx in self.output_names.items(): + model_outputs[output_name] = onnx_outputs[idx] + + if use_torch: + model_outputs[output_name] = torch.from_numpy(model_outputs[output_name]).to(self.device) + + return model_outputs + + @abstractmethod + def forward(self, *args, **kwargs): + raise NotImplementedError + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) class ORTModelUnet(ORTPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "time_cond_proj_dim"): + logger.warning( + "The `time_cond_proj_dim` attribute is missing from the UNet configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(time_cond_proj_dim=None) + def forward( self, sample: Union[np.ndarray, torch.Tensor], @@ -538,9 +545,15 @@ def forward( text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = False, ): use_torch = isinstance(sample, torch.Tensor) + if len(timestep.shape) == 0: + timestep = timestep.unsqueeze(0) + model_inputs = { "sample": sample, "timestep": timestep, @@ -548,171 +561,323 @@ def forward( "text_embeds": text_embeds, "time_ids": time_ids, "timestep_cond": timestep_cond, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), } - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) -class ORTModelVaeDecoder(ORTPipelinePart): - def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): - use_torch = isinstance(latent_sample, torch.Tensor) +class ORTModelTextEncoder(ORTPipelinePart): + def forward( + self, + input_ids: Union[np.ndarray, torch.Tensor], + attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, + output_hidden_states: Optional[bool] = None, + return_dict: bool = False, + ): + use_torch = isinstance(input_ids, torch.Tensor) - model_inputs = {"latent_sample": latent_sample} + model_inputs = {"input_ids": input_ids} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if output_hidden_states: + model_outputs["hidden_states"] = [] + for i in range(self.config.num_hidden_layers): + model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) + model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) + else: + for i in range(self.config.num_hidden_layers): + model_outputs.pop(f"hidden_states.{i}", None) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) class ORTModelVaeEncoder(ORTPipelinePart): - def forward(self, sample: Union[np.ndarray, torch.Tensor]): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE encoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) + + def forward( + self, + sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): use_torch = isinstance(sample, torch.Tensor) model_inputs = {"sample": sample} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if "latent_sample" in model_outputs: + model_outputs["latents"] = model_outputs.pop("latent_sample") + + if "latent_parameters" in model_outputs: + model_outputs["latent_dist"] = DiagonalGaussianDistribution( + parameters=model_outputs.pop("latent_parameters") + ) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) +class ORTModelVaeDecoder(ORTPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE decoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." + ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) + + def forward( + self, + latent_sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): + use_torch = isinstance(latent_sample, torch.Tensor) + + model_inputs = {"latent_sample": latent_sample} + + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if "latent_sample" in model_outputs: + model_outputs["latents"] = model_outputs.pop("latent_sample") + + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) + + +class ORTWrapperVae(ORTPipelinePart): + def __init__(self, encoder: ORTModelVaeEncoder, decoder: ORTModelVaeDecoder): + self.decoder = decoder + self.encoder = encoder + + @property + def config(self): + return self.decoder.config + + @property + def dtype(self): + return self.decoder.dtype + + @property + def device(self): + return self.decoder.device + + def decode(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + def encode(self, *args, **kwargs): + return self.encoder(*args, **kwargs) + + def to(self, *args, **kwargs): + self.decoder.to(*args, **kwargs) + if self.encoder is not None: + self.encoder.to(*args, **kwargs) + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). 
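    A minimal usage sketch (the checkpoint id is illustrative; `export=True` converts the PyTorch weights to ONNX on the fly):

        from optimum.onnxruntime import ORTStableDiffusionPipeline

        pipeline = ORTStableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", export=True
        )
        image = pipeline("sailing ship in storm by Leonardo da Vinci").images[0]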
""" main_input_name = "prompt" + export_feature = "text-to-image" auto_model_class = StableDiffusionPipeline - __call__ = StableDiffusionPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ - main_input_name = "prompt" + main_input_name = "image" + export_feature = "image-to-image" auto_model_class = StableDiffusionImg2ImgPipeline - __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ main_input_name = "prompt" + export_feature = "inpainting" auto_model_class = StableDiffusionInpaintPipeline - __call__ = StableDiffusionInpaintPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMixin): +class ORTStableDiffusionXLPipeline(ORTDiffusionPipeline, StableDiffusionXLPipeline): """ - ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). 
""" main_input_name = "prompt" - auto_model_class = LatentConsistencyModelPipeline - - __call__ = LatentConsistencyPipelineMixin.__call__ - + export_feature = "text-to-image" + auto_model_class = StableDiffusionXLPipeline -class ORTStableDiffusionXLPipelineBase(ORTPipeline): - def __init__( + def _get_add_time_ids( self, - vae_decoder_session: ort.InferenceSession, - text_encoder_session: ort.InferenceSession, - unet_session: ort.InferenceSession, - config: Dict[str, Any], - tokenizer: CLIPTokenizer, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - feature_extractor: Optional[CLIPFeatureExtractor] = None, - vae_encoder_session: Optional[ort.InferenceSession] = None, - text_encoder_2_session: Optional[ort.InferenceSession] = None, - tokenizer_2: Optional[CLIPTokenizer] = None, - use_io_binding: Optional[bool] = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - add_watermarker: Optional[bool] = None, + original_size, + crops_coords_top_left, + target_size, + dtype, + text_encoder_projection_dim=None, ): - super().__init__( - vae_decoder_session=vae_decoder_session, - text_encoder_session=text_encoder_session, - unet_session=unet_session, - config=config, - tokenizer=tokenizer, - scheduler=scheduler, - feature_extractor=feature_extractor, - vae_encoder_session=vae_encoder_session, - text_encoder_2_session=text_encoder_2_session, - tokenizer_2=tokenizer_2, - use_io_binding=use_io_binding, - model_save_dir=model_save_dir, - ) + add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids - if add_watermarker: - if not is_invisible_watermark_available(): - raise ImportError( - "`add_watermarker` requires invisible-watermark to be installed, which can be installed with `pip install invisible-watermark`." - ) - from ..pipelines.diffusers.watermark import StableDiffusionXLWatermarker +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTStableDiffusionXLImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionXLImg2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
+ """ - self.watermark = StableDiffusionXLWatermarker() + main_input_name = "prompt" + export_feature = "image-to-image" + auto_model_class = StableDiffusionXLImg2ImgPipeline + + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) else: - self.watermark = None + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLPipelineMixin): +class ORTStableDiffusionXLInpaintPipeline(ORTDiffusionPipeline, StableDiffusionXLInpaintPipeline): """ - ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline). """ - main_input_name = "prompt" - auto_model_class = StableDiffusionXLPipeline + main_input_name = "image" + export_feature = "inpainting" + auto_model_class = StableDiffusionXLInpaintPipeline + + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) - __call__ = StableDiffusionXLPipelineMixin.__call__ + return add_time_ids, add_neg_time_ids @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyModelPipeline): """ - ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
+ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ main_input_name = "prompt" - auto_model_class = StableDiffusionXLImg2ImgPipeline + export_feature = "text-to-image" + auto_model_class = LatentConsistencyModelPipeline + + +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsistencyModelImg2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline). + """ - __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = LatentConsistencyModelImg2ImgPipeline SUPPORTED_ORT_PIPELINES = [ ORTStableDiffusionPipeline, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, - ORTLatentConsistencyModelPipeline, ORTStableDiffusionXLPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, + ORTLatentConsistencyModelPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ] -def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): +def _get_ort_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): for ort_pipeline_class in SUPPORTED_ORT_PIPELINES: if ( ort_pipeline_class.__name__ == pipeline_class_name @@ -724,31 +889,6 @@ def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}") -class ORTDiffusionPipeline(ConfigMixin): - config_name = "model_index.json" - - @classmethod - @validate_hf_hub_args - def from_pretrained(cls, pretrained_model_or_path, **kwargs): - load_config_kwargs = { - "force_download": kwargs.get("force_download", False), - "resume_download": kwargs.get("resume_download", None), - "local_files_only": kwargs.get("local_files_only", False), - "cache_dir": kwargs.get("cache_dir", None), - "revision": kwargs.get("revision", None), - "proxies": kwargs.get("proxies", None), - "token": kwargs.get("token", None), - } - - config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) - config = config[0] if isinstance(config, tuple) else config - class_name = config["_class_name"] - - ort_pipeline_class = _get_pipeline_class(class_name) - - return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) - - ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionPipeline), @@ -761,12 +901,14 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): [ ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), + ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ] ) ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLInpaintPipeline), ] ) @@ -777,7 +919,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): ] -def _get_task_class(mapping, pipeline_class_name): +def _get_task_ort_class(mapping, pipeline_class_name): def _get_model_name(pipeline_class_name): for ort_pipelines_mapping in 
SUPPORTED_ORT_PIPELINES_MAPPINGS: for model_name, ort_pipeline_class in ort_pipelines_mapping.items(): @@ -801,7 +943,8 @@ class ORTPipelineForTask(ConfigMixin): config_name = "model_index.json" @classmethod - def from_pretrained(cls, pretrained_model_or_path, **kwargs): + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTDiffusionPipeline: load_config_kwargs = { "force_download": kwargs.get("force_download", False), "resume_download": kwargs.get("resume_download", None), @@ -815,7 +958,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): config = config[0] if isinstance(config, tuple) else config class_name = config["_class_name"] - ort_pipeline_class = _get_task_class(cls.ort_pipelines_mapping, class_name) + ort_pipeline_class = _get_task_ort_class(cls.ort_pipelines_mapping, class_name) return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 17bd3e2a4e7..9b29afa566b 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -938,7 +938,7 @@ def _prepare_onnx_inputs( onnx_inputs[input_name] = inputs.pop(input_name) if use_torch: - onnx_inputs[input_name] = onnx_inputs[input_name].cpu().detach().numpy() + onnx_inputs[input_name] = onnx_inputs[input_name].numpy(force=True) if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]: onnx_inputs[input_name] = onnx_inputs[input_name].astype( diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index fda3ca82bbe..27e0dc01b4c 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -67,7 +67,7 @@ if check_if_transformers_greater("4.25.0"): from transformers.generation import GenerationMixin else: - from transformers.generation_utils import GenerationMixin + from transformers.generation_utils import GenerationMixin # type: ignore if check_if_transformers_greater("4.43.0"): diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 985980e31b0..128e2406f11 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -403,3 +403,18 @@ def evaluation_loop( metrics = {} return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=len(dataset)) + + +def np_to_pt_generators(np_object, device): + if isinstance(np_object, np.random.RandomState): + return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0])) + elif isinstance(np_object, np.random.Generator): + return torch.Generator(device=device).manual_seed(int(np_object.bit_generator.state[1][0])) + elif isinstance(np_object, list) and isinstance(np_object[0], (np.random.RandomState, np.random.Generator)): + return [np_to_pt_generators(a, device) for a in np_object] + elif isinstance(np_object, dict) and isinstance( + next(iter(np_object.values())), (np.random.RandomState, np.random.Generator) + ): + return {k: np_to_pt_generators(v, device) for k, v in np_object.items()} + else: + return np_object diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py deleted file mode 100644 index 630d463de73..00000000000 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. 
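# A minimal usage sketch for the `np_to_pt_generators` helper added to optimum/onnxruntime/utils.py
# in the hunk above: it converts the NumPy-style generators that the previous ORT pipeline mixins
# accepted into torch.Generator objects seeded from the NumPy state, so the diffusers pipelines the
# ORT classes now inherit from receive the generator type they expect. Illustrative only; it assumes
# the helper is importable from optimum.onnxruntime.utils once this patch is applied.
import numpy as np
import torch

from optimum.onnxruntime.utils import np_to_pt_generators

# A single RandomState becomes a torch.Generator seeded from the first word of its MT19937 state,
# so call sites that passed np.random.RandomState keep producing deterministic latents.
pt_generator = np_to_pt_generators(np.random.RandomState(42), device="cpu")
assert isinstance(pt_generator, torch.Generator)

# Lists (and dicts) of RandomState objects are converted element-wise; any other object is
# returned unchanged, so non-generator kwargs can be passed through the same code path.
pt_generators = np_to_pt_generators([np.random.RandomState(0), np.random.RandomState(1)], device="cpu")
assert all(isinstance(g, torch.Generator) for g in pt_generators)
assert np_to_pt_generators("unchanged", device="cpu") == "unchanged"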
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from typing import Callable, List, Optional, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -logger = logging.getLogger(__name__) - - -class LatentConsistencyPipelineMixin(StableDiffusionPipelineMixin): - # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264 - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 4, - original_inference_steps: int = None, - guidance_scale: float = 8.5, - num_images_per_prompt: int = 1, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not - provided, text embeddings will be generated from `prompt` input argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor - - # Don't need to get negative prompts due to LCM guided distillation - negative_prompt = None - negative_prompt_embeds = None - - # check inputs. 
Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - False, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps) - timesteps = self.scheduler.timesteps - - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - self.unet.config["in_channels"], - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - bs = batch_size * num_images_per_prompt - # get Guidance Scale Embedding - w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype) - w_embedding = self.get_guidance_scale_embedding( - w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype - ) - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latents, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - timestep_cond=w_embedding, - )[0] - - # compute the previous noisy sample x_t -> x_t-1 - latents, denoised = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False - ) - latents, denoised = latents.numpy(), denoised.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = denoised - has_nsfw_concept = None - else: - denoised /= self.vae_decoder.config["scaling_factor"] - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264 - def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None): - """ - See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - - Args: - timesteps (`torch.Tensor`): - generate embedding vectors at these timesteps - embedding_dim (`int`, *optional*, defaults to 512): - dimension of the embeddings to generate - dtype: - data type of the generated embeddings - - Returns: - 
`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` - """ - w = w * 1000 - half_dim = embedding_dim // 2 - emb = np.log(10000.0) / (half_dim - 1) - emb = np.exp(np.arange(half_dim, dtype=dtype) * -emb) - emb = w[:, None] * emb[None, :] - emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=1) - - if embedding_dim % 2 == 1: # zero pad - emb = np.pad(emb, [(0, 0), (0, 1)]) - - assert emb.shape == (w.shape[0], embedding_dim) - return emb diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py deleted file mode 100644 index 6cc47fab1b9..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ /dev/null @@ -1,427 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import logging -from typing import Callable, List, Optional, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg - - -logger = logging.getLogger(__name__) - - -class StableDiffusionPipelineMixin(DiffusionPipelineMixin): - # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L114 - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids - - if not np.array_equal(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] - - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] * batch_size - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] - - if do_classifier_free_guidance: - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217 - def check_inputs( - self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*shape).astype(dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." - ) - elif latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - return latents - - # Adapted from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L264 - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - guidance_rescale: float = 0.0, - ): - r""" - Function invoked when calling the pipeline for generation. 
- - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. 
- guidance_rescale (`float`, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - - # check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - self.unet.config.get("in_channels", 4), - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - def run_safety_checker(self, image: np.ndarray): - if self.safety_checker is None: - has_nsfw_concept = None - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="np" - ).pixel_values.astype(image.dtype) - images, has_nsfw_concept = [], [] - for i in range(image.shape[0]): - image_i, has_nsfw_concept_i = self.safety_checker( - clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1] - ) - images.append(image_i) - has_nsfw_concept.append(has_nsfw_concept_i[0]) - image = np.concatenate(images) - - return image, has_nsfw_concept diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py 
b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py deleted file mode 100644 index a66035a789b..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -class StableDiffusionImg2ImgPipelineMixin(StableDiffusionPipelineMixin): - # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - strength: float, - callback_steps: int, - negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - else: - init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = np.concatenate([init_latents], axis=0) - - # add noise to latents using the timesteps - if isinstance(generator, np.random.RandomState): - noise = generator.randn(*init_latents.shape).astype(dtype) - elif isinstance(generator, torch.Generator): - noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." - ) - - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ).numpy() - - return init_latents - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__ - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`Union[np.ndarray, PIL.Image.Image]`): - `Image`, or tensor representing an image batch which will be upscaled. - strength (`float`, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. 
- guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): - A np.random.RandomState to make generation deterministic. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # check inputs. 
Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - image = self.image_processor.preprocess(image) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - latents_dtype = prompt_embeds.dtype - image = image.astype(latents_dtype) - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - - # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) - - timesteps = self.scheduler.timesteps.numpy()[-init_timestep] - timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) - - # 5. Prepare latent variables - latents = self.prepare_latents(image, timesteps, batch_size, num_images_per_prompt, latents_dtype, generator) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - t_start = max(num_inference_steps - init_timestep + offset, 0) - timesteps = self.scheduler.timesteps[t_start:].numpy() - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ - 0 - ] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - latents /= scaling_factor - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py deleted file mode 100644 index cb3c7db96e9..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import PIL_INTERPOLATION - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -def prepare_mask_and_masked_image(image, mask, latents_shape, vae_scale_factor): - image = np.array( - image.convert("RGB").resize((latents_shape[1] * vae_scale_factor, latents_shape[0] * vae_scale_factor)) - ) - image = image[None].transpose(0, 3, 1, 2) - image = image.astype(np.float32) / 127.5 - 1.0 - - image_mask = np.array( - mask.convert("L").resize((latents_shape[1] * vae_scale_factor, latents_shape[0] * vae_scale_factor)) - ) - masked_image = image * (image_mask < 127.5) - - mask = mask.resize((latents_shape[1], latents_shape[0]), PIL_INTERPOLATION["nearest"]) - mask = np.array(mask.convert("L")) - mask = mask.astype(np.float32) / 255.0 - mask = mask[None, None] - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - - return mask, masked_image - - -class StableDiffusionInpaintPipelineMixin(StableDiffusionPipelineMixin): - # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
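For orientation, `prepare_mask_and_masked_image` above reduces to a few NumPy operations. A simplified sketch (pixel resolution only, omitting the latent-resolution mask resize; shapes are illustrative):

import numpy as np
import PIL.Image

def sketch_mask_prep(image: PIL.Image.Image, mask: PIL.Image.Image, size=(512, 512)):
    img = np.array(image.convert("RGB").resize(size), dtype=np.float32)
    img = img[None].transpose(0, 3, 1, 2) / 127.5 - 1.0    # NCHW, scaled to [-1, 1]
    m = np.array(mask.convert("L").resize(size), dtype=np.float32)
    masked = img * (m < 127.5)                             # zero out the regions to repaint
    m = (m / 255.0 >= 0.5).astype(np.float32)[None, None]  # binary mask, NCHW
    return m, masked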
-                )
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        image: PIL.Image.Image,
-        mask_image: PIL.Image.Image,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        eta: float = 0.0,
-        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
-        latents: Optional[np.ndarray] = None,
-        prompt_embeds: Optional[np.ndarray] = None,
-        negative_prompt_embeds: Optional[np.ndarray] = None,
-        output_type: str = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-        callback_steps: int = 1,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`Union[str, List[str]]`):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
-                instead.
-            image (`PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to be inpainted.
-            mask_image (`PIL.Image.Image`):
-                `Image`, or tensor representing a mask of the image batch; white pixels mark the regions to repaint.
-            height (`Optional[int]`, defaults to None):
-                The height in pixels of the generated image.
-            width (`Optional[int]`, defaults to None):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages the model to generate images that are closely linked to the text
-                `prompt`, usually at the expense of lower image quality.
-            negative_prompt (`Optional[Union[str, list]]`):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
-            num_images_per_prompt (`int`, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
-                A np.random.RandomState or torch.Generator to make generation deterministic.
-            latents (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting.
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - - # check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - num_channels_latents = self.vae_decoder.config.get("latent_channels", 4) - num_channels_unet = self.unet.config.get("in_channels", 9) - latents_shape = ( - batch_size * num_images_per_prompt, - num_channels_latents, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - latents_dtype = prompt_embeds.dtype - - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*latents_shape).astype(latents_dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*latents_shape, generator=generator).numpy().astype(latents_dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
- ) - elif latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - - # prepare mask and masked_image - mask, masked_image = prepare_mask_and_masked_image( - image, mask_image, latents_shape[-2:], self.vae_scale_factor - ) - mask = mask.astype(latents.dtype) - masked_image = masked_image.astype(latents.dtype) - - masked_image_latents = self.vae_encoder(sample=masked_image)[0] - - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - masked_image_latents = scaling_factor * masked_image_latents - - # duplicate mask and masked_image_latents for each generation per prompt - mask = mask.repeat(batch_size * num_images_per_prompt, 0) - masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 0) - - mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask - masked_image_latents = ( - np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - ) - - # check that sizes of mask, masked image and latents match - if num_channels_unet == 9: - # default case for runwayml/stable-diffusion-inpainting - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: expects" - f" {num_channels_unet} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" - f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input." - ) - elif num_channels_unet != 4: - raise ValueError( - f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {num_channels_unet}." - ) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
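The 9-channel check above mirrors how the inpainting UNet input is assembled at each step. Schematically (shapes are illustrative, for a 512x512 image with a VAE scale factor of 8):

import numpy as np

latents = np.zeros((1, 4, 64, 64), dtype=np.float32)               # noisy image latents
mask = np.zeros((1, 1, 64, 64), dtype=np.float32)                  # binary inpaint mask
masked_image_latents = np.zeros((1, 4, 64, 64), dtype=np.float32)  # VAE-encoded masked image

unet_input = np.concatenate([latents, mask, masked_image_latents], axis=1)
assert unet_input.shape[1] == 4 + 1 + 4  # the 9 channels expected by the inpainting UNet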
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - # concat latents, mask, masked_image_latnets in the channel dimension - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - if num_channels_unet == 9: - latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1) - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ - 0 - ] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - latents /= scaling_factor - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py deleted file mode 100644 index 0407c16a77a..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ /dev/null @@ -1,506 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput - -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg - - -logger = logging.getLogger(__name__) - - -class StableDiffusionXLPipelineMixin(DiffusionPipelineMixin): - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. 
- """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - - if prompt_embeds is None: - prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - # get prompt text embeddings - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( - text_input_ids, untruncated_ids - ): - removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = text_encoder( - input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds[-2] - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = np.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - negative_prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - negative_prompt_embeds = text_encoder( - input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds[-2] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) - - pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) - negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
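To make the dual-encoder logic above concrete: SDXL runs both CLIP text encoders, keeps the pooled output of the second, and concatenates the per-encoder hidden states along the feature axis. A schematic sketch (hidden sizes are illustrative of SDXL's two encoders):

import numpy as np

emb_1 = np.zeros((1, 77, 768), dtype=np.float32)    # penultimate hidden states, encoder 1
emb_2 = np.zeros((1, 77, 1280), dtype=np.float32)   # penultimate hidden states, encoder 2
pooled_2 = np.zeros((1, 1280), dtype=np.float32)    # pooled output of encoder 2

prompt_embeds = np.concatenate([emb_1, emb_2], axis=-1)  # (1, 77, 2048), fed to the UNet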
- ) - - if prompt_embeds is not None and pooled_prompt_embeds is None: - raise ValueError( - "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." - ) - - if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." - ) - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*shape).astype(dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." - ) - elif latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - return latents - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
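The `inspect`-based filtering used in `prepare_extra_step_kwargs` generalizes to any scheduler; a minimal sketch (helper name invented):

import inspect

def filter_step_kwargs(scheduler, **kwargs):
    # Keep only the kwargs that this scheduler's `step` actually accepts,
    # e.g. `eta` is consumed by DDIMScheduler but unknown to most others.
    accepted = set(inspect.signature(scheduler.step).parameters)
    return {k: v for k, v in kwargs.items() if k in accepted}

So `filter_step_kwargs(scheduler, eta=0.0)` passes `eta` through only when the scheduler supports it, which is why the pipelines can share one denoising loop across schedulers.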
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - extra_step_kwargs = {} - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_eta: - extra_step_kwargs["eta"] = eta - - return extra_step_kwargs - - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.7): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # 0. Default height and width to unet - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) - - # 2. Define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. 
Encode input prompt - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - self.unet.config.get("in_channels", 4), - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Prepare added time ids & embeddings - add_text_embeds = pooled_prompt_embeds - add_time_ids = (original_size + crops_coords_top_left + target_size,) - add_time_ids = np.array(add_time_ids, dtype=prompt_embeds.dtype) - - if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) - add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - # 8. Denoising loop - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latent_model_input, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - text_embeds=add_text_embeds, - time_ids=add_time_ids, - ) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py deleted file mode 100644 index 19988599b64..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ /dev/null @@ -1,515 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput - -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg - - -logger = logging.getLogger(__name__) - - -class StableDiffusionXLImg2ImgPipelineMixin(DiffusionPipelineMixin): - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. 
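The guidance arithmetic in the denoising loops above is the standard classifier-free-guidance combination over one batched UNet pass (unconditional first, text-conditioned second); in isolation:

import numpy as np

def apply_cfg(noise_pred: np.ndarray, guidance_scale: float) -> np.ndarray:
    # One UNet call on the doubled batch yields both branches; split and recombine.
    noise_uncond, noise_text = np.split(noise_pred, 2)
    return noise_uncond + guidance_scale * (noise_text - noise_uncond)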
- - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - - if prompt_embeds is None: - prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - # get prompt text embeddings - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( - text_input_ids, untruncated_ids - ): - removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = text_encoder( - input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds[-2] - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = np.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = 
np.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - negative_prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - negative_prompt_embeds = text_encoder( - input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds[-2] - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) - - pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) - negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - strength: float, - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
- ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].numpy() - - return timesteps, num_inference_steps - t_start - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - else: - init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = np.concatenate([init_latents], axis=0) - - # add noise to latents using the timesteps - if isinstance(generator, np.random.RandomState): - noise = generator.randn(*init_latents.shape).astype(dtype) - elif isinstance(generator, torch.Generator): - noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
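Put together, img2img initialization above amounts to: encode the image, scale it, then noise it to the starting timestep. A condensed sketch using a diffusers scheduler (`image_latents` stands in for the already-scaled output of the ONNX VAE encoder):

import numpy as np
import torch
from diffusers import DDIMScheduler

def noisy_init_latents(image_latents: np.ndarray, scheduler: DDIMScheduler,
                       timestep: np.ndarray, seed: int = 0) -> np.ndarray:
    # image_latents: VAE-encoded input, already multiplied by the scaling factor (~0.18215)
    noise = np.random.RandomState(seed).randn(*image_latents.shape).astype(image_latents.dtype)
    noised = scheduler.add_noise(
        torch.from_numpy(image_latents), torch.from_numpy(noise), torch.from_numpy(timestep)
    )
    return noised.numpy()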
- ) - - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ) - init_latents = init_latents.numpy() - - return init_latents - - def _get_add_time_ids( - self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype - ): - if self.config.get("requires_aesthetics_score"): - add_time_ids = (original_size + crops_coords_top_left + (aesthetic_score,),) - add_neg_time_ids = (original_size + crops_coords_top_left + (negative_aesthetic_score,),) - else: - add_time_ids = (original_size + crops_coords_top_left + target_size,) - add_neg_time_ids = (original_size + crops_coords_top_left + target_size,) - - add_time_ids = np.array(add_time_ids, dtype=dtype) - add_neg_time_ids = np.array(add_neg_time_ids, dtype=dtype) - - return add_time_ids, add_neg_time_ids - - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, - strength: float = 0.3, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - aesthetic_score: float = 6.0, - negative_aesthetic_score: float = 2.5, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`Union[np.ndarray, PIL.Image.Image]`): - `Image`, or tensor representing an image batch which will be upscaled. - strength (`float`, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`Optional[Union[str, list]]`):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-                is less than `1`).
-            num_images_per_prompt (`int`, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
-                A np.random.RandomState or torch.Generator to make generation deterministic.
-            latents (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, defaults to `"pil"`):
-                The output format of the generated image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
-                plain tuple.
-            callback (Optional[Callable], defaults to `None`):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-            guidance_rescale (`float`, defaults to 0.0):
-                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-                Guidance rescale factor should fix overexposure when using zero terminal SNR.
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
-            `tuple`. When returning a tuple, the first element is a list with the generated images.
-        """
-        # 0. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
-
-        # 1.
Define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 2. Encode input prompt - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - ) - - # 3. Preprocess image - image = self.image_processor.preprocess(image) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = np.repeat(timesteps[:1], batch_size * num_images_per_prompt, axis=0) - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - latents_dtype = prompt_embeds.dtype - image = image.astype(latents_dtype) - - # 5. Prepare latent variables - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, latents_dtype, generator - ) - - # 6. Prepare extra step kwargs - extra_step_kwargs = {} - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_eta: - extra_step_kwargs["eta"] = eta - - height, width = latents.shape[-2:] - height = height * self.vae_scale_factor - width = width * self.vae_scale_factor - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 8. Prepare added time ids & embeddings - add_text_embeds = pooled_prompt_embeds - add_time_ids, add_neg_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - aesthetic_score, - negative_aesthetic_score, - dtype=prompt_embeds.dtype, - ) - - if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) - add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) - - # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier-free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latent_model_input, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - text_embeds=add_text_embeds, - time_ids=add_time_ids, - ) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on Section 3.4 in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems like there is a strange result when using a half-precision vae decoder with batch size > 1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py deleted file mode 100644 index e9d5986b61c..00000000000 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
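A rough standalone sketch of the guidance step in the denoising loop above (NumPy only, toy shapes; `guidance_scale` and `guidance_rescale` stand in for the pipeline arguments, and `rescale_noise_cfg`, defined below, implements the same rescaling):

import numpy as np

# stand-in for the stacked unet output: unconditional + text-conditioned
# noise predictions over a batch of 4-channel 8x8 latents
rng = np.random.default_rng(0)
noise_pred = rng.standard_normal((2, 4, 8, 8)).astype(np.float32)
guidance_scale, guidance_rescale = 7.5, 0.7

# classifier-free guidance: extrapolate from the unconditional prediction
# towards the text-conditioned one
noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

# guidance rescale: match the std of the text prediction (fixes overexposure
# with zero terminal SNR schedules), then blend by guidance_rescale
std_text = noise_pred_text.std(axis=(1, 2, 3), keepdims=True)
std_cfg = noise_pred.std(axis=(1, 2, 3), keepdims=True)
noise_pred = guidance_rescale * (noise_pred * std_text / std_cfg) + (1 - guidance_rescale) * noise_pred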
- - -import warnings -from typing import List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers import ConfigMixin -from diffusers.image_processor import VaeImageProcessor as DiffusersVaeImageProcessor -from diffusers.utils.pil_utils import PIL_INTERPOLATION -from PIL import Image -from tqdm.auto import tqdm - - -class DiffusionPipelineMixin(ConfigMixin): - # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L812 - @staticmethod - def numpy_to_pil(images): - """ - Converts a numpy image or a batch of images to a PIL image. - """ - if images.ndim == 3: - images = images[None, ...] - images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] - else: - pil_images = [Image.fromarray(image) for image in images] - - return pil_images - - # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L827 - def progress_bar(self, iterable=None, total=None): - if not hasattr(self, "_progress_bar_config"): - self._progress_bar_config = {} - elif not isinstance(self._progress_bar_config, dict): - raise ValueError( - f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}." - ) - - if iterable is not None: - return tqdm(iterable, **self._progress_bar_config) - elif total is not None: - return tqdm(total=total, **self._progress_bar_config) - else: - raise ValueError("Either `total` or `iterable` has to be defined.") - - -# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L58 -def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): - """ - Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 - """ - std_text = np.std(noise_pred_text, axis=tuple(range(1, noise_pred_text.ndim)), keepdims=True) - std_cfg = np.std(noise_cfg, axis=tuple(range(1, noise_cfg.ndim)), keepdims=True) - # rescale the results from guidance (fixes overexposure) - noise_pred_rescaled = noise_cfg * (std_text / std_cfg) - # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images - noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg - return noise_cfg - - -class VaeImageProcessor(DiffusersVaeImageProcessor): - # Adapted from diffusers.VaeImageProcessor.denormalize - @staticmethod - def denormalize(images: np.ndarray): - """ - Denormalize an image array to [0,1]. - """ - return np.clip(images / 2 + 0.5, 0, 1) - - # Adapted from diffusers.VaeImageProcessor.preprocess - def preprocess( - self, - image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - height: Optional[int] = None, - width: Optional[int] = None, - ) -> np.ndarray: - """ - Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors. 
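- - Example (an illustrative sketch, assuming the default config and a 64x64 RGB PIL input; names are hypothetical): - - import PIL.Image - processor = VaeImageProcessor() - image = PIL.Image.new("RGB", (64, 64)) - out = processor.preprocess(image) # np.ndarray of shape (1, 3, 64, 64) with values in [-1, 1]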
- """ - supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - - do_convert_grayscale = getattr(self.config, "do_convert_grayscale", False) - # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image - if do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: - if isinstance(image, torch.Tensor): - # if image is a pytorch tensor could have 2 possible shapes: - # 1. batch x height x width: we should insert the channel dimension at position 1 - # 2. channnel x height x width: we should insert batch dimension at position 0, - # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 - # for simplicity, we insert a dimension of size 1 at position 1 for both cases - image = image.unsqueeze(1) - else: - # if it is a numpy array, it could have 2 possible shapes: - # 1. batch x height x width: insert channel dimension on last position - # 2. height x width x channel: insert batch dimension on first position - if image.shape[-1] == 1: - image = np.expand_dims(image, axis=0) - else: - image = np.expand_dims(image, axis=-1) - - if isinstance(image, supported_formats): - image = [image] - elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): - raise ValueError( - f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support {', '.join(supported_formats)}" - ) - - if isinstance(image[0], PIL.Image.Image): - if self.config.do_convert_rgb: - image = [self.convert_to_rgb(i) for i in image] - elif do_convert_grayscale: - image = [self.convert_to_grayscale(i) for i in image] - if self.config.do_resize: - height, width = self.get_height_width(image[0], height, width) - image = [self.resize(i, height, width) for i in image] - image = self.reshape(self.pil_to_numpy(image)) - else: - if isinstance(image[0], torch.Tensor): - image = [self.pt_to_numpy(elem) for elem in image] - image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - else: - image = self.reshape(np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)) - - if do_convert_grayscale and image.ndim == 3: - image = np.expand_dims(image, 1) - - # don't need any preprocess if the image is latents - if image.shape[1] == 4: - return image - - if self.config.do_resize: - height, width = self.get_height_width(image, height, width) - image = self.resize(image, height, width) - - # expected range [0,1], normalize to [-1,1] - do_normalize = self.config.do_normalize - if image.min() < 0 and do_normalize: - warnings.warn( - "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " - f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]", - FutureWarning, - ) - do_normalize = False - - if do_normalize: - image = self.normalize(image) - - if getattr(self.config, "do_binarize", False): - image = self.binarize(image) - - return image - - # Adapted from diffusers.VaeImageProcessor.postprocess - def postprocess( - self, - image: np.ndarray, - output_type: str = "pil", - do_denormalize: Optional[List[bool]] = None, - ): - if not isinstance(image, np.ndarray): - raise ValueError( - f"Input for postprocessing is in incorrect format: {type(image)}. 
We only support np array" - ) - if output_type not in ["latent", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - warnings.warn(deprecation_message, FutureWarning) - output_type = "np" - - if output_type == "latent": - return image - - if do_denormalize is None: - do_denormalize = [self.config.do_normalize] * image.shape[0] - - image = np.stack( - [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0 - ) - - image = image.transpose((0, 2, 3, 1)) - - if output_type == "pil": - image = self.numpy_to_pil(image) - - return image - - def get_height_width( - self, - image: Union[PIL.Image.Image, np.ndarray], - height: Optional[int] = None, - width: Optional[int] = None, - ): - """ - This function returns the height and width downscaled to the next integer multiple of - `vae_scale_factor`. - - Args: - image (`PIL.Image.Image`, `np.ndarray`): - The image input. Can be a PIL image, NumPy array or PyTorch tensor. If it is a NumPy array, it should - have shape `[batch, height, width]` or `[batch, height, width, channel]`; if it is a PyTorch tensor, it - should have shape `[batch, channel, height, width]`. - height (`int`, *optional*, defaults to `None`): - The height of the preprocessed image. If `None`, will use the height of the `image` input. - width (`int`, *optional*, defaults to `None`): - The width of the preprocessed image. If `None`, will use the width of the `image` input. - """ - height = height or (image.height if isinstance(image, PIL.Image.Image) else image.shape[-2]) - width = width or (image.width if isinstance(image, PIL.Image.Image) else image.shape[-1]) - # resize to integer multiple of vae_scale_factor - width, height = (x - x % self.config.vae_scale_factor for x in (width, height)) - return height, width - - # Adapted from diffusers.VaeImageProcessor.numpy_to_pt - @staticmethod - def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: - """ - Convert a NumPy image to a PyTorch tensor. - """ - if images.ndim == 3: - images = images[..., None] - - images = torch.from_numpy(images) - return images - - # Adapted from diffusers.VaeImageProcessor.pt_to_numpy - @staticmethod - def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: - """ - Convert a PyTorch tensor to a NumPy image. - """ - images = images.cpu().float().numpy() - return images - - @staticmethod - def reshape(images: np.ndarray) -> np.ndarray: - """ - Reshape inputs to the expected shape. - """ - if images.ndim == 3: - images = images[..., None] - - return images.transpose(0, 3, 1, 2) - - # TODO: remove after diffusers v0.21.0 release - def resize( - self, - image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], - height: Optional[int] = None, - width: Optional[int] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: - """ - Resize image. 
- """ - if isinstance(image, PIL.Image.Image): - image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) - elif isinstance(image, torch.Tensor): - image = torch.nn.functional.interpolate(image, size=(height, width)) - elif isinstance(image, np.ndarray): - image = self.numpy_to_pt(image) - image = torch.nn.functional.interpolate(image, size=(height, width)) - image = self.pt_to_numpy(image) - return image diff --git a/optimum/pipelines/diffusers/watermark.py b/optimum/pipelines/diffusers/watermark.py deleted file mode 100644 index b3cd622edac..00000000000 --- a/optimum/pipelines/diffusers/watermark.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy as np -from imwatermark import WatermarkEncoder - - -WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110 -WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]] - - -# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion_xl/watermark.py#L12 -class StableDiffusionXLWatermarker: - def __init__(self): - self.watermark = WATERMARK_BITS - self.encoder = WatermarkEncoder() - self.encoder.set_watermark("bits", self.watermark) - - def apply_watermark(self, images: np.array): - # can't encode images that are smaller than 256 - if images.shape[-1] < 256: - return images - - # cv2 doesn't support float16 - if images.dtype == np.float16: - images = images.astype(np.float32) - - images = (255 * (images / 2 + 0.5)).transpose((0, 2, 3, 1)) - - images = np.array([self.encoder.encode(image, "dwtDct") for image in images]).transpose((0, 3, 1, 2)) - - np.clip(2 * (images / 255 - 0.5), -1.0, 1.0, out=images) - - return images diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index d1471aa218a..7671d6cd2e6 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -43,7 +43,7 @@ from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.onnx.model_configs import WhisperOnnxConfig from optimum.exporters.onnx.utils import get_speecht5_models_for_export -from optimum.utils import ONNX_WEIGHTS_NAME, DummyPastKeyValuesGenerator, NormalizedTextConfig +from optimum.utils import DummyPastKeyValuesGenerator, NormalizedTextConfig from optimum.utils.save_utils import maybe_load_preprocessors from optimum.utils.testing_utils import grid_parameters, require_diffusers @@ -292,27 +292,22 @@ def _onnx_export( gc.collect() - def _onnx_export_sd(self, model_type: str, model_name: str, device="cpu"): + def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device="cpu"): pipeline = TasksManager.get_model_from_task(model_type, model_name, device=device) models_and_onnx_configs = get_diffusion_models_for_export(pipeline) - output_names = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs] - model, _ = models_and_onnx_configs["vae_encoder"] - model.forward = lambda sample: {"latent_sample": model.encode(x=sample)["latent_dist"].parameters} with TemporaryDirectory() as tmpdirname: _, onnx_outputs = export_models( models_and_onnx_configs=models_and_onnx_configs, opset=14, output_dir=Path(tmpdirname), - output_names=output_names, device=device, ) validate_models_outputs( models_and_onnx_configs=models_and_onnx_configs, onnx_named_outputs=onnx_outputs, output_dir=Path(tmpdirname), - atol=1e-3, - onnx_files_subpaths=output_names, + atol=1e-4, use_subprocess=False, ) @@ -403,7 +398,7 @@ def 
test_tensorflow_export( @require_vision @require_diffusers def test_pytorch_export_for_diffusion_models(self, model_type, model_name): - self._onnx_export_sd(model_type, model_name) + self._onnx_export_diffusion_models(model_type, model_name) @parameterized.expand(PYTORCH_DIFFUSION_MODEL.items()) @require_torch @@ -414,7 +409,7 @@ def test_pytorch_export_for_diffusion_models(self, model_type, model_name): @pytest.mark.run_slow @pytest.mark.gpu_test def test_pytorch_export_for_diffusion_models_cuda(self, model_type, model_name): - self._onnx_export_sd(model_type, model_name, device="cuda") + self._onnx_export_diffusion_models(model_type, model_name, device="cuda") class CustomWhisperOnnxConfig(WhisperOnnxConfig): diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 9f480b2d1a0..956566f0e1f 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -12,10 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import unittest import numpy as np -import PIL import pytest import torch from diffusers import ( @@ -24,6 +22,7 @@ AutoPipelineForText2Image, DiffusionPipeline, ) +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.utils import load_image from parameterized import parameterized from transformers.testing_utils import require_torch_gpu @@ -35,8 +34,7 @@ ORTPipelineForInpainting, ORTPipelineForText2Image, ) -from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor -from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm +from optimum.utils.testing_utils import grid_parameters, require_diffusers def get_generator(framework, seed): @@ -72,16 +70,8 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= return [image] * batch_size -def to_np(image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - class ORTPipelineForText2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] ORTMODEL_CLASS = ORTPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -126,17 +116,16 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): def test_num_images_per_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - height, width, batch_size = 64, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = 
self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -150,61 +139,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - if model_arch == "latent-consistency": - # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step - # TODO: Investigate why this is the case - inputs["num_inference_steps"] = 1 - - for output_type in ["latent", "np"]: + for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - - self.assertTrue( - np.allclose(ort_output, diffusers_output, atol=1e-4), - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), - ) - self.assertEqual(ort_pipeline.device, diffusers_pipeline.device) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 64, 32, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -220,7 +161,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def 
__call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -243,17 +184,21 @@ def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: def test_shape(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - height, width, batch_size = 128, 64, 1 + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + height, width, batch_size = 128, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -263,9 +208,6 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): - if model_arch in ["latent-consistency"]: - pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -279,14 +221,11 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): - if model_arch in ["latent-consistency"]: - pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -295,9 +234,8 @@ def test_negative_prompt(self, model_arch: str): negative_prompt = ["This is a negative prompt"] pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - image_slice_1 = pipeline( - **inputs, negative_prompt=negative_prompt, generator=np.random.RandomState(SEED) - ).images[0, -3:, -3:, -1] + + images_1 = pipeline(**inputs, negative_prompt=negative_prompt, generator=get_generator("pt", SEED)).images prompt = inputs.pop("prompt") if model_arch == "stable-diffusion-xl": @@ -306,39 +244,96 @@ def test_negative_prompt(self, model_arch: str): inputs["negative_prompt_embeds"], inputs["pooled_prompt_embeds"], inputs["negative_pooled_prompt_embeds"], - ) = pipeline._encode_prompt(prompt, 1, False, negative_prompt) + ) = pipeline.encode_prompt( + prompt=prompt, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) else: - text_ids = pipeline.tokenizer( - prompt, - max_length=pipeline.tokenizer.model_max_length, - padding="max_length", - return_tensors="np", - truncation=True, - ).input_ids - negative_text_ids = pipeline.tokenizer( - negative_prompt, - max_length=pipeline.tokenizer.model_max_length, - 
padding="max_length", - return_tensors="np", - truncation=True, - ).input_ids - inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0] - inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0] - - image_slice_2 = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1] - - self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1)) + inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( + prompt=prompt, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) + + images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) + ) + @pytest.mark.rocm_ep_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + + outputs = pipeline(**inputs).images + + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(["stable-diffusion", "latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) + + ort_images = ort_output.images + diffusers_images = diffusers_output.images + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) class ORTPipelineForImage2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] AUTOMODEL_CLASS = AutoPipelineForImage2Image ORTMODEL_CLASS = ORTPipelineForImage2Image TASK = "image-to-image" - def generate_inputs(self, 
height=128, width=128, batch_size=1, channel=3, input_type="np"): + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): inputs = _generate_prompts(batch_size=batch_size) inputs["image"] = _generate_images( @@ -369,11 +364,6 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - - # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): @@ -381,68 +371,18 @@ def test_num_images_per_prompt(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, 
width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): - if model_arch in ["stable-diffusion"]: - pytest.skip( - "Stable Diffusion For Img2Img doesn't behave as expected with callbacks (doesn't call it every step with callback_steps=1)" - ) - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -455,7 +395,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -478,18 +418,21 @@ def test_shape(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - height, width, batch_size = 32, 64, 1 - for input_type in ["np", "pil", "pt"]: + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -499,27 +442,26 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - pytest.skip("Img2Img models do not support support output reproducibility for some reason") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type - self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2)) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): - pytest.skip("Img2Img models do not support support output reproducibility for some reason") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -533,12 +475,73 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - 
self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) + ) + @pytest.mark.rocm_ep_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device.type, "cuda") + + outputs = pipeline(**inputs).images + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(["stable-diffusion", "latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) + + ort_images = ort_output.images + diffusers_images = diffusers_output.images + + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) class ORTPipelineForInpaintingTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] AUTOMODEL_CLASS = AutoPipelineForInpainting ORTMODEL_CLASS = ORTPipelineForInpainting @@ -546,18 +549,16 @@ class ORTPipelineForInpaintingTest(ORTModelTestMixin): TASK = "inpainting" def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): - assert batch_size == 1, "Inpainting models only support batch_size=1" - assert input_type == "pil", "Inpainting models only support input_type='pil'" - inputs = _generate_prompts(batch_size=batch_size) inputs["image"] = _generate_images( - height=height, width=width, batch_size=1, channel=channel, input_type="pil" - )[0] + height=height, width=width, batch_size=batch_size, 
channel=channel, input_type=input_type + ) inputs["mask_image"] = _generate_images( - height=height, width=width, batch_size=1, channel=channel, input_type="pil" - )[0] + height=height, width=width, batch_size=batch_size, channel=1, input_type=input_type + ) + inputs["strength"] = 0.75 inputs["height"] = height inputs["width"] = width @@ -583,11 +584,6 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - - # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): @@ -595,59 +591,14 @@ def test_num_images_per_prompt(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, 
num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -664,7 +615,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -687,18 +638,21 @@ def test_shape(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - height, width, batch_size = 32, 64, 1 - for input_type in ["pil"]: + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -708,11 +662,6 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - if model_arch in ["stable-diffusion"]: - pytest.skip( - "Stable Diffusion For Inpainting fails, it was used to be compared to StableDiffusionPipeline for some reason which is the text-to-image variant" - ) - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -722,23 +671,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - latents_shape = ( - batch_size, - ort_pipeline.vae_decoder.config["latent_channels"], - height // ort_pipeline.vae_scale_factor, - width // ort_pipeline.vae_scale_factor, - ) + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type - np_latents = np.random.rand(*latents_shape).astype(np.float32) - torch_latents = torch.from_numpy(np_latents) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - ort_output = ort_pipeline(**inputs, latents=np_latents).images - diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images - - self.assertTrue( - np.allclose(ort_output, diffusers_output, atol=1e-4), - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), - ) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -756,38 +695,65 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + 
np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) + ) + @pytest.mark.rocm_ep_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device, "cuda") + + outputs = pipeline(**inputs).images + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(["stable-diffusion"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) -class ImageProcessorTest(unittest.TestCase): - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pt = torch.stack(_generate_images(height=8, width=8, batch_size=1, input_type="pt")) - input_np = to_np(input_pt) - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_np = np.stack(_generate_images(height=8, width=8, input_type="np")) - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pil = 
_generate_images(height=8, width=8, batch_size=1, input_type="pil") - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + ort_images = ort_output.images + diffusers_images = diffusers_output.images + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index f6771ce7618..665f253c480 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -148,7 +148,7 @@ def __init__(self, *args, **kwargs): self.ONNX_SEQ2SEQ_MODEL_ID = "optimum/t5-small" self.LARGE_ONNX_SEQ2SEQ_MODEL_ID = "facebook/mbart-large-en-ro" self.TINY_ONNX_SEQ2SEQ_MODEL_ID = "fxmarty/sshleifer-tiny-mbart-onnx" - self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "optimum-internal-testing/tiny-stable-diffusion-onnx" def test_load_model_from_local_path(self): model = ORTModel.from_pretrained(self.LOCAL_MODEL_PATH) @@ -222,17 +222,17 @@ def test_load_seq2seq_model_from_empty_cache(self): @require_diffusers def test_load_stable_diffusion_model_from_cache(self): _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching - model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True ) - self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder) self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_from_empty_cache(self): dirpath = os.path.join( @@ -325,6 +325,8 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test @@ -339,6 +341,8 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers @require_torch_gpu @require_ort_rocm @@ -354,6 +358,8 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): model = ORTStableDiffusionPipeline.from_pretrained( @@ -366,6 +372,8 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): with 
self.assertRaises(ValueError): diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 17f3b391b04..5071d0081af 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -171,6 +171,11 @@ class ORTModelTestMixin(unittest.TestCase): "np": np.ndarray, } + TASK = None + + ORTMODEL_CLASS = None + AUTOMODEL_CLASS = None + @classmethod def setUpClass(cls): cls.onnx_model_dirs = {} From 2c0476eda1398b9a81cb966c817a460ed6e53413 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 9 Oct 2024 18:49:38 +0200 Subject: [PATCH 12/50] Enable ONNX export for transformers 4.45 (#2045) * Enable ONNX export for transformers 4.45 * add comment * update setup --- optimum/exporters/onnx/convert.py | 11 +++++------ setup.py | 7 ++----- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index f2bf95f3e3c..d72fd7eb21a 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -26,7 +26,6 @@ import numpy as np import onnx -import transformers from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available @@ -531,6 +530,11 @@ def export_pytorch( logger.info(f"Using framework PyTorch: {torch.__version__}") FORCE_ONNX_EXTERNAL_DATA = os.getenv("FORCE_ONNX_EXTERNAL_DATA", "0") == "1" + model_kwargs = model_kwargs or {} + # num_logits_to_keep was added in transformers 4.45 and isn't added as inputs when exporting the model + if check_if_transformers_greater("4.44.99") and "num_logits_to_keep" in signature(model.forward).parameters.keys(): + model_kwargs["num_logits_to_keep"] = 0 + with torch.no_grad(): model.config.return_dict = True model = model.eval() @@ -1001,11 +1005,6 @@ def onnx_export_from_model( >>> onnx_export_from_model(model, output="gpt2_onnx/") ``` """ - if check_if_transformers_greater("4.44.99"): - raise ImportError( - f"ONNX conversion disabled for now for transformers version greater than v4.45, found {transformers.__version__}" - ) - TasksManager.standardize_model_attributes(model) if hasattr(model.config, "export_model_type"): diff --git a/setup.py b/setup.py index 0e2f0fd1bb6..63f202faa6e 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,6 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers<4.45.0", ], "onnxruntime-gpu": [ "onnx", @@ -63,10 +62,9 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. 
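The `num_logits_to_keep` handling added to `export_pytorch` above can be summarized with a small sketch (a hypothetical standalone helper, not the exporter's actual API):

from inspect import signature

def pin_num_logits_to_keep(model, model_kwargs=None):
    # transformers >= 4.45 adds a `num_logits_to_keep` argument to forward()
    # that is not part of the ONNX inputs; pinning it to 0 ("keep all logits")
    # before tracing keeps the exported graph equivalent to the eager model.
    model_kwargs = dict(model_kwargs or {})
    if "num_logits_to_keep" in signature(model.forward).parameters:
        model_kwargs["num_logits_to_keep"] = 0
    return model_kwargs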
- "transformers<4.45.0", ], - "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.45.0"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.45.0"], + "exporters": ["onnx", "onnxruntime", "timm"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", @@ -77,7 +75,6 @@ "numpy<1.24.0", "datasets<=2.16", "transformers[sentencepiece]>=4.26,<4.38", - "transformers<4.45.0", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", From 1b5a63da593599b1e6e178754146e0109d3305d9 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:52:35 +0200 Subject: [PATCH 13/50] Remove the need for the config to be in the subfolder (#2044) * remove the need for the config to be in the subfolder * fix * fix for offline mode * add log * fix * enable load local model in subfolder * fix windows --- optimum/modeling_base.py | 36 ++++++++++++++++++----------- optimum/onnxruntime/modeling_ort.py | 6 ++--- tests/onnxruntime/test_modeling.py | 15 ++++++++++++ 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 29521b7c0c6..48c738514ae 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -380,27 +380,35 @@ def from_pretrained( ) model_id, revision = model_id.split("@") + all_files, _ = TasksManager.get_model_files( + model_id, + subfolder=subfolder, + cache_dir=cache_dir, + revision=revision, + token=token, + ) + + config_folder = subfolder + if cls.config_name not in all_files: + logger.info( + f"{cls.config_name} not found in the specified subfolder {subfolder}. Using the top level {cls.config_name}." + ) + config_folder = "" + library_name = TasksManager.infer_library_from_model( - model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + model_id, subfolder=config_folder, revision=revision, cache_dir=cache_dir, token=token ) if library_name == "timm": config = PretrainedConfig.from_pretrained( - model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + model_id, subfolder=config_folder, revision=revision, cache_dir=cache_dir, token=token ) if config is None: - if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME: - if CONFIG_NAME in os.listdir(os.path.join(model_id, subfolder)): - config = AutoConfig.from_pretrained( - os.path.join(model_id, subfolder), trust_remote_code=trust_remote_code - ) - elif CONFIG_NAME in os.listdir(model_id): + if os.path.isdir(os.path.join(model_id, config_folder)) and cls.config_name == CONFIG_NAME: + if CONFIG_NAME in os.listdir(os.path.join(model_id, config_folder)): config = AutoConfig.from_pretrained( - os.path.join(model_id, CONFIG_NAME), trust_remote_code=trust_remote_code - ) - logger.info( - f"config.json not found in the specified subfolder {subfolder}. Using the top level config.json." 
+ os.path.join(model_id, config_folder), trust_remote_code=trust_remote_code ) else: raise OSError(f"config.json not found in {model_id} local folder") @@ -411,7 +419,7 @@ def from_pretrained( cache_dir=cache_dir, token=token, force_download=force_download, - subfolder=subfolder, + subfolder=config_folder, trust_remote_code=trust_remote_code, ) elif isinstance(config, (str, os.PathLike)): @@ -421,7 +429,7 @@ def from_pretrained( cache_dir=cache_dir, token=token, force_download=force_download, - subfolder=subfolder, + subfolder=config_folder, trust_remote_code=trust_remote_code, ) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 9b29afa566b..ce1d68536ac 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -510,13 +510,12 @@ def _from_pretrained( if file_name is None: if model_path.is_dir(): - onnx_files = list(model_path.glob("*.onnx")) + onnx_files = list((model_path / subfolder).glob("*.onnx")) else: repo_files, _ = TasksManager.get_model_files( model_id, revision=revision, cache_dir=cache_dir, token=token ) repo_files = map(Path, repo_files) - pattern = "*.onnx" if subfolder == "" else f"{subfolder}/*.onnx" onnx_files = [p for p in repo_files if p.match(pattern)] @@ -983,10 +982,9 @@ def _cached_file( token = use_auth_token model_path = Path(model_path) - # locates a file in a local folder and repo, downloads and cache it if necessary. if model_path.is_dir(): - model_cache_path = model_path / file_name + model_cache_path = model_path / subfolder / file_name preprocessors = maybe_load_preprocessors(model_path.as_posix()) else: model_cache_path = hf_hub_download( diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 665f253c480..501c7dac240 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -28,6 +28,7 @@ import requests import timm import torch +from huggingface_hub import HfApi from huggingface_hub.constants import default_cache_path from parameterized import parameterized from PIL import Image @@ -1263,6 +1264,20 @@ def test_trust_remote_code(self): torch.allclose(pt_logits, ort_logits, atol=1e-4), f" Maxdiff: {torch.abs(pt_logits - ort_logits).max()}" ) + @parameterized.expand(("", "onnx")) + def test_loading_with_config_not_from_subfolder(self, subfolder): + # config.json file in the root directory and not in the subfolder + model_id = "sentence-transformers-testing/stsb-bert-tiny-onnx" + # hub model + ORTModelForFeatureExtraction.from_pretrained(model_id, subfolder=subfolder, export=subfolder == "") + # local model + api = HfApi() + with tempfile.TemporaryDirectory() as tmpdirname: + local_dir = Path(tmpdirname) / "model" + api.snapshot_download(repo_id=model_id, local_dir=local_dir) + ORTModelForFeatureExtraction.from_pretrained(local_dir, subfolder=subfolder, export=subfolder == "") + remove_directory(tmpdirname) + class ORTModelForQuestionAnsweringIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = [ From 851f04b13aab9f17f1c3b5080767a2cc440bb2b1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:54:01 +0200 Subject: [PATCH 14/50] Remove upper transformers version limit (#2048) --- setup.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 63f202faa6e..fb290274a3b 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - 
"transformers[sentencepiece]>=4.29,<4.46.0", + "transformers[sentencepiece]>=4.29", "torch>=1.11", "packaging", "numpy", @@ -54,6 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", + "transformers<4.46.0", ], "onnxruntime-gpu": [ "onnx", @@ -62,9 +63,10 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. + "transformers<4.46.0", ], - "exporters": ["onnx", "onnxruntime", "timm"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], + "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.46.0"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.46.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", From 4ce73646eb13dff14503092eeb92f94d6a1ee7b1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:57:31 +0200 Subject: [PATCH 15/50] Dev version (#2049) --- optimum/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/version.py b/optimum/version.py index 4a8a7edab63..4fff28e5c97 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.23.0.dev0" +__version__ = "1.24.0.dev0" From 6172e96914d6f49aec253db05c98d827c158caab Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:24:00 +0200 Subject: [PATCH 16/50] Fix doc build (#2050) * Fix doc build * Trigger PR doc build when the PR doc build workflow is modified * Fix issue with torch-xla and ubuntu-latest --- .github/workflows/build_main_documentation.yml | 8 ++++++-- .github/workflows/build_pr_documentation.yml | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index 20face917ab..11e36ed57f3 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -10,7 +10,7 @@ on: jobs: build_documentation: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -66,7 +66,7 @@ jobs: sudo apt-get purge -y '^mysql.*' sudo apt-get purge -y '^java.*' sudo apt-get purge -y '^openjdk.*' - sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel + sudo apt-get purge -y microsoft-edge-stable azure-cli google-chrome-stable firefox mono-devel df -h sudo apt-get autoremove -y >/dev/null 2>&1 sudo apt-get clean @@ -110,6 +110,8 @@ jobs: - name: Setup environment run: | + python -m venv venv-doc + source venv-doc/bin/activate pip uninstall -y doc-builder cd doc-builder git pull origin main @@ -135,6 +137,7 @@ jobs: - name: Make Furiosa documentation run: | + source venv-doc/bin/activate cd optimum-furiosa pip install . sudo apt install software-properties-common @@ -159,6 +162,7 @@ jobs: - name: Make TPU documentation run: | sudo docker system prune -a -f + source venv-doc/bin/activate cd optimum-tpu pip install -U pip pip install . 
-f https://storage.googleapis.com/libtpu-releases/index.html

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index e5f2dcb0d18..6eb09aff304 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -8,6 +8,7 @@ on:
       - "optimum/**.py"
       - "docs/**.mdx"
       - "docs/**.yml"
+      - ".github/workflows/build_pr_documentation.yml"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -15,7 +16,7 @@ concurrency:
 
 jobs:
   build_documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     env:
       COMMIT_SHA: ${{ github.event.pull_request.head.sha }}
       PR_NUMBER: ${{ github.event.number }}
@@ -60,6 +61,8 @@ jobs:
 
       - name: Setup environment
         run: |
+          python -m venv venv-doc
+          source venv-doc/bin/activate
          pip uninstall -y doc-builder
           cd doc-builder
           git pull origin main
@@ -99,6 +102,7 @@ jobs:
       - name: Make TPU documentation
         run: |
           sudo docker system prune -a -f
+          source venv-doc/bin/activate
           cd optimum-tpu
           pip install -U pip
           pip install . -f https://storage.googleapis.com/libtpu-releases/index.html

From eb6f5de5ce3eb69f73d7a0ee0da30f9bd8ca2a08 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Fri, 11 Oct 2024 10:27:47 +0200
Subject: [PATCH 17/50] Don't hardcode the logger level to INFO; let users set TRANSFORMERS_VERBOSITY (#2047)

And keep the default as WARNING, i.e. the level expected for Python modules
---
 optimum/exporters/onnx/__main__.py    | 1 -
 optimum/exporters/tflite/__main__.py  | 1 -
 optimum/onnx/transformations_utils.py | 1 -
 3 files changed, 3 deletions(-)

diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py
index 703e98df3e2..6a2cc6834a6 100644
--- a/optimum/exporters/onnx/__main__.py
+++ b/optimum/exporters/onnx/__main__.py
@@ -43,7 +43,6 @@ from .base import OnnxConfig
 
 logger = logging.get_logger()
-logger.setLevel(logging.INFO)
 
 
 def main_export(
diff --git a/optimum/exporters/tflite/__main__.py b/optimum/exporters/tflite/__main__.py
index b3c90cb63f2..0c4c7b994fa 100644
--- a/optimum/exporters/tflite/__main__.py
+++ b/optimum/exporters/tflite/__main__.py
@@ -28,7 +28,6 @@
 
 logger = logging.get_logger()
-logger.setLevel(logging.INFO)
 
 
 def main():
diff --git a/optimum/onnx/transformations_utils.py b/optimum/onnx/transformations_utils.py
index 1f0765112e8..fe55a5a5770 100644
--- a/optimum/onnx/transformations_utils.py
+++ b/optimum/onnx/transformations_utils.py
@@ -29,7 +29,6 @@
 
 logger = logging.get_logger()
-logger.setLevel(logging.INFO)
 
 
 def _find_duplicate_initializers(

From 690d35b1ab31f375f5a4b74bf6eba37517656c05 Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Fri, 11 Oct 2024 10:37:45 +0200
Subject: [PATCH 18/50] Add workflow to mark issues as stale (#2051)

---
 .github/workflows/stale.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 .github/workflows/stale.yml

diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
new file mode 100644
index 00000000000..a5e50a795b6
--- /dev/null
+++ b/.github/workflows/stale.yml
@@ -0,0 +1,14 @@
+name: 'Close stale issues and PRs'
+on:
+  schedule:
+    - cron: '30 1 * * *'
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v8
+        with:
+          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
+          days-before-stale: 30
+          days-before-close: 5

From b42db7ee6b5fa43e41adcbd501a3bd183b589991 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Fri, 11 Oct 2024 11:04:50 +0200
Subject: [PATCH 19/50] Fix onnx export CLI for transformers >= 4.45 (#2053)

* fix onnx export
* add test
---
 optimum/exporters/onnx/convert.py               | 3 ++-
 tests/exporters/onnx/test_exporters_onnx_cli.py | 8 ++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index d72fd7eb21a..565183b38fc 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -26,6 +26,7 @@
 
 import numpy as np
 import onnx
+from transformers.generation import GenerationMixin
 from transformers.modeling_utils import get_parameter_dtype
 from transformers.utils import is_tf_available, is_torch_available
 
@@ -1127,7 +1128,7 @@ def onnx_export_from_model(
 
     if check_if_transformers_greater("4.44.99"):
         misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
-        if model.can_generate() and len(misplaced_generation_parameters) > 0:
+        if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
             logger.warning(
                 "Moving the following attributes in the config to the generation config: "
                 f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py
index ed611ade04e..8b186e9307b 100644
--- a/tests/exporters/onnx/test_exporters_onnx_cli.py
+++ b/tests/exporters/onnx/test_exporters_onnx_cli.py
@@ -602,6 +602,14 @@ def test_diffusion(self):
             check=True,
         )
 
+    def test_sentence_transformers(self):
+        with TemporaryDirectory() as tmpdirname:
+            subprocess.run(
+                f"python3 -m optimum.exporters.onnx --model sentence-transformers-testing/stsb-bert-tiny-onnx --task feature-extraction {tmpdirname}",
+                shell=True,
+                check=True,
+            )
+
     def test_legacy(self):
         with TemporaryDirectory() as tmpdirname:
             subprocess.run(

From 94201540ac41b0a86042b04df0a3b374793761b8 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Fri, 11 Oct 2024 12:18:30 +0200
Subject: [PATCH 20/50] Fix onnx export for transformers>=4.45 (#2054)

* fix onnx export for transformers>=4.45
* fix tests
* style
---
 optimum/exporters/onnx/convert.py               | 6 +++++-
 tests/exporters/onnx/test_exporters_onnx_cli.py | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index 565183b38fc..2661d835979 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -1128,7 +1128,11 @@ def onnx_export_from_model(
 
     if check_if_transformers_greater("4.44.99"):
         misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
-        if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
+        if (
+            isinstance(model, GenerationMixin)
+            and model.can_generate()
+            and len(misplaced_generation_parameters) > 0
+        ):
             logger.warning(
                 "Moving the following attributes in the config to the generation config: "
                 f"{misplaced_generation_parameters}.
You are seeing this warning because you've set " diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py index 8b186e9307b..9ac7832aa7d 100644 --- a/tests/exporters/onnx/test_exporters_onnx_cli.py +++ b/tests/exporters/onnx/test_exporters_onnx_cli.py @@ -602,6 +602,7 @@ def test_diffusion(self): check=True, ) + @require_sentence_transformers def test_sentence_transformers(self): with TemporaryDirectory() as tmpdirname: subprocess.run( From 0d808ade96b01e35e5c8a38b0b156ce4b241f433 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 11 Oct 2024 16:23:17 +0200 Subject: [PATCH 21/50] Upgrade macOS image for tests compatibility with numpy v2 (#2055) * update runner environment * fix * downgrade * Update .github/workflows/test_bettertransformer.yml Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update .github/workflows/test_bettertransformer.yml Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * fix --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- .github/workflows/test_bettertransformer.yml | 5 ++--- .github/workflows/test_onnx.yml | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_bettertransformer.yml b/.github/workflows/test_bettertransformer.yml index 080d8272dfc..b023fa4bd1b 100644 --- a/.github/workflows/test_bettertransformer.yml +++ b/.github/workflows/test_bettertransformer.yml @@ -15,9 +15,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-13] - exclude: [{ python-version: 3.8, os: macos-13 }] + python-version: [3.9] + os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 9aa8b307235..22a11720798 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-13] + os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} steps: From 058593927303ec94726c15a0cebf4da96ee628ec Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:45:17 +0200 Subject: [PATCH 22/50] Fix main doc build (#2057) --- .github/workflows/build_main_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index 11e36ed57f3..efd61c1fd4f 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -57,6 +57,7 @@ jobs: - name: Free disk space run: | df -h + sudo apt-get update sudo apt-get purge -y '^apache.*' sudo apt-get purge -y '^imagemagick.*' sudo apt-get purge -y '^dotnet.*' From 9fd9ca5505e83f67c2a5e4c4f6f56d5fcf28442f Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 14 Oct 2024 12:54:15 +0200 Subject: [PATCH 23/50] Enter venv before pushing doc in main doc build workflow (#2058) --- .github/workflows/build_main_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index efd61c1fd4f..c922f5097da 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ 
-197,6 +197,7 @@ jobs: - name: Push to repositories run: | + source venv-doc/bin/activate cd optimum/optimum-doc-build sudo chmod -R ugo+rwx optimum doc-builder push optimum --doc_build_repo_id "hf-doc-build/doc-build" --token "${{ secrets.HF_DOC_BUILD_PUSH }}" --commit_msg "Updated with commit ${{ github.sha }} See: https://github.com/huggingface/optimum/commit/${{ github.sha }}" --n_retries 5 --upload_version_yml From d11e2850158694258875b6e4fcba1c5db61af42a Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:12:05 +0200 Subject: [PATCH 24/50] Fix tests expected environment variable name (#2059) * fix env variable name * fix test * comment * load onnx revision --- .github/workflows/test_onnxruntime.yml | 2 +- tests/onnxruntime/test_modeling.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 291a3b08335..a72bedb1ab7 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -51,7 +51,7 @@ jobs: - name: Test with pytest (in parallel) env: - FXMARTYCLONE_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} working-directory: tests run: | pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 501c7dac240..33243da278a 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -974,9 +974,13 @@ def test_load_model_from_hub_private(self): token = os.environ.get("HF_HUB_READ_TOKEN", None) if token is None: - self.skipTest("Test requires a token for fxmartyclone in the environment variable `HF_HUB_READ_TOKEN`.") + self.skipTest( + "Test requires a read access token for optimum-internal-testing in the environment variable `HF_HUB_READ_TOKEN`." 
+ ) - model = ORTModelForCustomTasks.from_pretrained("optimum-internal-testing/tiny-random-phi-private", token=token) + model = ORTModelForCustomTasks.from_pretrained( + "optimum-internal-testing/tiny-random-phi-private", revision="onnx", token=token + ) self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) From 1e5014e70f17e0437c4b0a7f4e65e170688d8ab0 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:12:17 +0200 Subject: [PATCH 25/50] Remove unused HF_TOKEN environment variable (#2061) remove unused HF_TOKEN environment variable --- .github/workflows/test_fx_automatic_parallel.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test_fx_automatic_parallel.yml b/.github/workflows/test_fx_automatic_parallel.yml index 05ebf7ea9e5..c5d82be38b3 100644 --- a/.github/workflows/test_fx_automatic_parallel.yml +++ b/.github/workflows/test_fx_automatic_parallel.yml @@ -35,7 +35,6 @@ jobs: options: --mount type=tmpfs,destination=/tmp --shm-size 64gb --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/ env: NCCL_DEBUG: INFO - HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} defaults: run: shell: bash From 8e54205b3b6b45f10f6360c05bb3a560a27354fe Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:06:48 +0200 Subject: [PATCH 26/50] Fix compatibility with diffusers < 0.25.0 (#2063) * Fix compatibility with diffusers < 0.25.0 * fix import --- optimum/onnxruntime/modeling_diffusion.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 87fcb68c7e9..3899a7b36b6 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -26,7 +26,6 @@ import numpy as np import torch from diffusers.configuration_utils import ConfigMixin -from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution from diffusers.pipelines import ( AutoPipelineForImage2Image, AutoPipelineForInpainting, @@ -52,6 +51,7 @@ from transformers.modeling_outputs import ModelOutput import onnxruntime as ort +from optimum.utils import check_if_diffusers_greater from ..exporters.onnx import main_export from ..onnx.utils import _get_model_external_data_paths @@ -73,6 +73,12 @@ ) +if check_if_diffusers_greater("0.25.0"): + from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution +else: + from diffusers.models.vae import DiagonalGaussianDistribution + + logger = logging.getLogger(__name__) From 59d6f7f04e390fb13fcba62bf22cea6ff2030623 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:59:22 +0200 Subject: [PATCH 27/50] Clean up ORT documentation (#2065) * refactor ort doc * fix links * fix --- .../package_reference/modeling_ort.mdx | 5 + .../onnxruntime/usage_guides/models.mdx | 275 +++--------------- 2 files changed, 48 insertions(+), 232 deletions(-) diff --git a/docs/source/onnxruntime/package_reference/modeling_ort.mdx b/docs/source/onnxruntime/package_reference/modeling_ort.mdx index 65b2b60195a..2c93ab3ac0d 100644 --- a/docs/source/onnxruntime/package_reference/modeling_ort.mdx +++ b/docs/source/onnxruntime/package_reference/modeling_ort.mdx @@ -119,6 +119,11 @@ The following ORT classes are available for the following custom tasks. 
## Stable Diffusion +#### ORTDiffusionPipeline + +[[autodoc]] onnxruntime.ORTDiffusionPipeline + - __call__ + #### ORTStableDiffusionPipeline [[autodoc]] onnxruntime.ORTStableDiffusionPipeline diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 131822e9568..1292e755c06 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -4,263 +4,74 @@ Optimum is a utility package for building and running inference with accelerated Optimum can be used to load optimized models from the [Hugging Face Hub](hf.co/models) and create pipelines to run accelerated inference without rewriting your APIs. -## Switching from Transformers to Optimum -The `optimum.onnxruntime.ORTModelForXXX` model classes are API compatible with Hugging Face Transformers models. This -means you can just replace your `AutoModelForXXX` class with the corresponding `ORTModelForXXX` class in `optimum.onnxruntime`. +## Loading -You do not need to adapt your code to get it to work with `ORTModelForXXX` classes: +### Transformers models -```diff -from transformers import AutoTokenizer, pipeline --from transformers import AutoModelForQuestionAnswering -+from optimum.onnxruntime import ORTModelForQuestionAnswering - --model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2") # PyTorch checkpoint -+model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2") # ONNX checkpoint -tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2") - -onnx_qa = pipeline("question-answering",model=model,tokenizer=tokenizer) - -question = "What's my name?" -context = "My name is Philipp and I live in Nuremberg." -pred = onnx_qa(question, context) -``` - -### Loading a vanilla Transformers model - -Because the model you want to work with might not be already converted to ONNX, [`~optimum.onnxruntime.ORTModel`] -includes a method to convert vanilla Transformers models to ONNX ones. Simply pass `export=True` to the -[`~optimum.onnxruntime.ORTModel.from_pretrained`] method, and your model will be loaded and converted to ONNX on-the-fly: - -```python ->>> from optimum.onnxruntime import ORTModelForSequenceClassification - ->>> # Load the model from the hub and export it to the ONNX format ->>> model = ORTModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased-finetuned-sst-2-english", export=True -... ) -``` - -### Pushing ONNX models to the Hugging Face Hub - -It is also possible, just as with regular [`~transformers.PreTrainedModel`]s, to push your `ORTModelForXXX` to the -[Hugging Face Model Hub](https://hf.co/models): - -```python ->>> from optimum.onnxruntime import ORTModelForSequenceClassification - ->>> # Load the model from the hub and export it to the ONNX format ->>> model = ORTModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased-finetuned-sst-2-english", export=True -... ) - ->>> # Save the converted model ->>> model.save_pretrained("a_local_path_for_convert_onnx_model") - -# Push the onnx model to HF Hub ->>> model.push_to_hub( # doctest: +SKIP -... "a_local_path_for_convert_onnx_model", repository_id="my-onnx-repo", use_auth_token=True -... ) -``` - -## Sequence-to-sequence models - -Sequence-to-sequence (Seq2Seq) models can also be used when running inference with ONNX Runtime. 
When Seq2Seq models -are exported to the ONNX format, they are decomposed into three parts that are later combined during inference: -- The encoder part of the model -- The decoder part of the model + the language modeling head -- The same decoder part of the model + language modeling head but taking and using pre-computed key / values as inputs and -outputs. This makes inference faster. - -Here is an example of how you can load a T5 model to the ONNX format and run inference for a translation task: - - -```python ->>> from transformers import AutoTokenizer, pipeline ->>> from optimum.onnxruntime import ORTModelForSeq2SeqLM - -# Load the model from the hub and export it to the ONNX format ->>> model_name = "t5-small" ->>> model = ORTModelForSeq2SeqLM.from_pretrained(model_name, export=True) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) - -# Create a pipeline ->>> onnx_translation = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) ->>> text = "He never went out without a book under his arm, and he often came back with two." ->>> result = onnx_translation(text) ->>> # [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] -``` - -## Stable Diffusion - -Stable Diffusion models can also be used when running inference with ONNX Runtime. When Stable Diffusion models -are exported to the ONNX format, they are split into four components that are later combined during inference: -- The text encoder -- The U-NET -- The VAE encoder -- The VAE decoder - -Make sure you have 🤗 Diffusers installed. - -To install `diffusers`: -```bash -pip install diffusers -``` - -### Text-to-Image - -Here is an example of how you can load an ONNX Stable Diffusion model and run inference using ONNX Runtime: - -```python -from optimum.onnxruntime import ORTStableDiffusionPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipeline = ORTStableDiffusionPipeline.from_pretrained(model_id, revision="onnx") -prompt = "sailing ship in storm by Leonardo da Vinci" -image = pipeline(prompt).images[0] -``` +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `AutoModelForXxx` class with the corresponding `ORTModelForXxx`. -To load your PyTorch model and convert it to ONNX on-the-fly, you can set `export=True`. - -```python -pipeline = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True) - -# Don't forget to save the ONNX model -save_directory = "a_local_path" -pipeline.save_pretrained(save_directory) -``` - -
- -
- -### Image-to-Image - -```python -import requests -import torch -from PIL import Image -from io import BytesIO -from optimum.onnxruntime import ORTStableDiffusionImg2ImgPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipeline = ORTStableDiffusionImg2ImgPipeline.from_pretrained(model_id, revision="onnx") - -url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" - -response = requests.get(url) -init_image = Image.open(BytesIO(response.content)).convert("RGB") -init_image = init_image.resize((768, 512)) - -prompt = "A fantasy landscape, trending on artstation" - -image = pipeline(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] -image.save("fantasy_landscape.png") -``` - -### Inpaint - -```python -import PIL -import requests -import torch -from io import BytesIO -from optimum.onnxruntime import ORTStableDiffusionInpaintPipeline - -model_id = "runwayml/stable-diffusion-inpainting" -pipeline = ORTStableDiffusionInpaintPipeline.from_pretrained(model_id, revision="onnx") - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - -img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" -mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" +```diff + from transformers import AutoTokenizer, pipeline +- from transformers import AutoModelForQuestionAnswering ++ from optimum.onnxruntime import ORTModelForQuestionAnswering -init_image = download_image(img_url).resize((512, 512)) -mask_image = download_image(mask_url).resize((512, 512)) +- model = AutoModelForQuestionAnswering.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint ++ model = ORTModelForQuestionAnswering.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + result = pipe("He never went out without a book under his arm") ``` +More information for all the supported `ORTModelForXxx` in our [documentation](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort) -## Stable Diffusion XL - -Before using `ORTStableDiffusionXLPipeline` make sure to have `diffusers` and `invisible_watermark` installed. You can install the libraries as follows: -```bash -pip install diffusers -pip install invisible-watermark>=0.2.0 -``` - -### Text-to-Image - -Here is an example of how you can load a SDXL ONNX model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference using ONNX Runtime : +### Diffusers models -```python -from optimum.onnxruntime import ORTStableDiffusionXLPipeline +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `DiffusionPipeline` class with the corresponding `ORTDiffusionPipeline`. 
-model_id = "stabilityai/stable-diffusion-xl-base-1.0" -base = ORTStableDiffusionXLPipeline.from_pretrained(model_id) -prompt = "sailing ship in storm by Leonardo da Vinci" -image = base(prompt).images[0] -# Don't forget to save the ONNX model -save_directory = "sd_xl_base" -base.save_pretrained(save_directory) +```diff +- from diffusers import DiffusionPipeline ++ from optimum.onnxruntime import ORTDiffusionPipeline + + model_id = "runwayml/stable-diffusion-v1-5" +- pipeline = DiffusionPipeline.from_pretrained(model_id) ++ pipeline = ORTDiffusionPipeline.from_pretrained(model_id, revision="onnx") + prompt = "sailing ship in storm by Leonardo da Vinci" + image = pipeline(prompt).images[0] ``` +## Converting your model to ONNX on-the-fly -### Image-to-Image - -Here is an example of how you can load a PyTorch SDXL model, convert it to ONNX on-the-fly and run inference using ONNX Runtime for *image-to-image* : +In case your model wasn't already [converted to ONNX](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), [`~optimum.onnxruntime.ORTModel`] includes a method to convert your model to ONNX on-the-fly. +Simply pass `export=True` to the [`~optimum.onnxruntime.ORTModel.from_pretrained`] method, and your model will be loaded and converted to ONNX on-the-fly: ```python -from optimum.onnxruntime import ORTStableDiffusionXLImg2ImgPipeline -from diffusers.utils import load_image - -model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" -pipeline = ORTStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) +>>> from optimum.onnxruntime import ORTModelForSequenceClassification -url = "https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/castle_friedrich.png" -image = load_image(url).convert("RGB") -prompt = "medieval castle by Caspar David Friedrich" -image = pipeline(prompt, image=image).images[0] -image.save("medieval_castle.png") +>>> # Load the model from the hub and export it to the ONNX format +>>> model_id = "distilbert-base-uncased-finetuned-sst-2-english" +>>> model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True) ``` -### Refining the image output - -The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0). In this case, you only have to output the latents from the base model. +## Pushing your model to the Hub +You can also call `push_to_hub` directly on your model to upload it to the [Hub](https://hf.co/models). 
```python -from optimum.onnxruntime import ORTStableDiffusionXLImg2ImgPipeline - -model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" -refiner = ORTStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) - -image = base(prompt=prompt, output_type="latent").images[0] -image = refiner(prompt=prompt, image=image[None, :]).images[0] -image.save("sailing_ship.png") -``` - - - -## Latent Consistency Models - -### Text-to-Image +>>> from optimum.onnxruntime import ORTModelForSequenceClassification -Here is an example of how you can load a Latent Consistency Models (LCMs) from [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and run inference using ONNX Runtime : +>>> # Load the model from the hub and export it to the ONNX format +>>> model_id = "distilbert-base-uncased-finetuned-sst-2-english" +>>> model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True) -```python -from optimum.onnxruntime import ORTLatentConsistencyModelPipeline +>>> # Save the converted model locally +>>> output_dir = "a_local_path_for_convert_onnx_model" +>>> model.save_pretrained(output_dir) -model_id = "SimianLuo/LCM_Dreamshaper_v7" -pipeline = ORTLatentConsistencyModelPipeline.from_pretrained(model_id, export=True) -prompt = "sailing ship in storm by Leonardo da Vinci" -images = pipeline(prompt, num_inference_steps=4, guidance_scale=8.0).images -``` +# Push the onnx model to HF Hub +>>> model.push_to_hub(output_dir, repository_id="my-onnx-repo") # doctest: +SKIP +``` \ No newline at end of file From 8af46e53bd1321d325ea4e712e7da8aca98df49f Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Sat, 19 Oct 2024 20:37:48 +0200 Subject: [PATCH 28/50] Fix ort documentation code snippet (#2070) fix code snippet --- docs/source/onnxruntime/usage_guides/models.mdx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 1292e755c06..905e6632c05 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -13,11 +13,11 @@ Once your model was [exported to the ONNX format](https://huggingface.co/docs/op ```diff from transformers import AutoTokenizer, pipeline -- from transformers import AutoModelForQuestionAnswering -+ from optimum.onnxruntime import ORTModelForQuestionAnswering +- from transformers import AutoModelForCausalLM ++ from optimum.onnxruntime import ORTModelForCausalLM -- model = AutoModelForQuestionAnswering.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint -+ model = ORTModelForQuestionAnswering.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint +- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint ++ model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) @@ -74,4 +74,4 @@ You can also call `push_to_hub` directly on your model to upload it to the [Hub] # Push the onnx model to HF Hub >>> model.push_to_hub(output_dir, repository_id="my-onnx-repo") # doctest: +SKIP -``` \ No newline at end of file +``` From 58c3571156466c13ebc7e22d3405b376ab2d222b Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 22 Oct 
2024 16:48:29 +0200 Subject: [PATCH 29/50] Update the habana extra (#2077) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fb290274a3b..822d8be1b80 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ "nncf": "optimum-intel[nncf]>=1.18.0", "neural-compressor": "optimum-intel[neural-compressor]>=1.18.0", "ipex": "optimum-intel[ipex]>=1.18.0", - "habana": ["optimum-habana", "transformers>=4.43.0,<4.44.0"], + "habana": ["optimum-habana", "transformers>=4.45.0,<4.46.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers>=4.36.2,<4.42.0"], "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers>=4.36.2,<4.42.0"], "graphcore": "optimum-graphcore", From 2e637be5d6b3e15c2b300130599bcec0f3e12ec8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 25 Oct 2024 13:41:10 +0200 Subject: [PATCH 30/50] Add sentence-transformers and timm documentation example (#2072) * add sentence-transformers and timm example to documentation * replace with onnx models * rephrase --- .../onnxruntime/usage_guides/models.mdx | 58 ++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 905e6632c05..27ac446096b 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -9,7 +9,7 @@ to run accelerated inference without rewriting your APIs. ### Transformers models -Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `AutoModelForXxx` class with the corresponding `ORTModelForXxx`. +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `AutoModelForXxx` with the corresponding `ORTModelForXxx` class. ```diff from transformers import AutoTokenizer, pipeline @@ -29,7 +29,7 @@ More information for all the supported `ORTModelForXxx` in our [documentation](h ### Diffusers models -Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `DiffusionPipeline` class with the corresponding `ORTDiffusionPipeline`. +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `DiffusionPipeline` with the corresponding `ORTDiffusionPipeline` class. ```diff @@ -43,6 +43,60 @@ Once your model was [exported to the ONNX format](https://huggingface.co/docs/op image = pipeline(prompt).images[0] ``` + +### Sentence Transformers models + +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `AutoModel` with the corresponding `ORTModelForFeatureExtraction` class. 
+ +```diff + from transformers import AutoTokenizer +- from transformers import AutoModel ++ from optimum.onnxruntime import ORTModelForFeatureExtraction + + tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") +- model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") ++ model = ORTModelForFeatureExtraction.from_pretrained("optimum/all-MiniLM-L6-v2") + inputs = tokenizer("This is an example sentence", return_tensors="pt") + outputs = model(**inputs) +``` + +You can also load your ONNX model directly using the [`sentence_transformers.SentenceTransformer`](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#onnx) class, just make sure to have `sentence-transformers>=3.2` installed. If the model wasn't already converted to ONNX, it will be converted automatically on-the-fly. + +```diff + from sentence_transformers import SentenceTransformer + + model_id = "sentence-transformers/all-MiniLM-L6-v2" +- model = SentenceTransformer(model_id) ++ model = SentenceTransformer(model_id, backend="onnx") + + sentences = ["This is an example sentence", "Each sentence is converted"] + embeddings = model.encode(sentences) +``` + + +### Timm models + +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `create_model` with the corresponding `ORTModelForImageClassification` class. + + +```diff + import requests + from PIL import Image +- from timm import create_model + from timm.data import resolve_data_config, create_transform ++ from optimum.onnxruntime import ORTModelForImageClassification + +- model = create_model("timm/mobilenetv3_large_100.ra_in1k", pretrained=True) ++ model = ORTModelForImageClassification.from_pretrained("optimum/mobilenetv3_large_100.ra_in1k") + transform = create_transform(**resolve_data_config(model.config.pretrained_cfg, model=model)) + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png" + image = Image.open(requests.get(url, stream=True).raw) + inputs = transform(image).unsqueeze(0) + outputs = model(inputs) +``` + + + ## Converting your model to ONNX on-the-fly In case your model wasn't already [converted to ONNX](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), [`~optimum.onnxruntime.ORTModel`] includes a method to convert your model to ONNX on-the-fly. 
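The next patch extends the `ORTModelForXxx` forward passes to fabricate a `token_type_ids` input whenever the exported ONNX graph declares that input but the caller omitted it. The pattern can be sketched in isolation; this is a minimal illustration only (the helper name `default_token_type_ids` is made up for the example and is not part of the Optimum API):

```python
import numpy as np
import torch


def default_token_type_ids(input_names, input_ids, token_type_ids=None):
    # Only fabricate token_type_ids when the ONNX session actually declares
    # that input and the caller omitted it; an all-zeros tensor encodes
    # "single segment", which is what tokenizers produce for one-sequence inputs.
    if token_type_ids is None and "token_type_ids" in input_names:
        if isinstance(input_ids, torch.Tensor):
            token_type_ids = torch.zeros_like(input_ids)
        else:
            token_type_ids = np.zeros_like(input_ids)
    return token_type_ids


ids = np.array([[101, 7592, 2088, 102]])  # toy BERT-style input ids
print(default_token_type_ids({"input_ids", "token_type_ids"}, ids))
# -> [[0 0 0 0]]
```

The test added alongside the change relies on exactly this equivalence: running a BERT feature extractor with an explicit all-zeros `token_type_ids` tensor and with the argument omitted must produce the same hidden states.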
From 4a39ae0b1de05601c8a33f4a13c244bdd016db24 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:20:31 +0100 Subject: [PATCH 31/50] Create token type ids when not provided (#2081) * create token type ids when needed * add test --- optimum/onnxruntime/modeling_ort.py | 19 ++++++++++++++++++- tests/onnxruntime/test_modeling.py | 12 ++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index ce1d68536ac..8e5a814b689 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -931,7 +931,6 @@ def _prepare_onnx_inputs( self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray] ) -> Dict[str, np.ndarray]: onnx_inputs = {} - # converts pytorch inputs into numpy inputs for onnx for input_name in self.input_names.keys(): onnx_inputs[input_name] = inputs.pop(input_name) @@ -1086,6 +1085,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1241,6 +1243,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1330,6 +1335,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1437,6 +1445,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1527,6 +1538,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1610,6 +1624,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, diff --git 
a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index 33243da278a..da450b8e31c 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -2192,6 +2192,18 @@ def test_compare_to_io_binding(self, model_arch):
 
         gc.collect()
 
+    def test_default_token_type_ids(self):
+        model_id = MODEL_NAMES["bert"]
+        model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokens = tokenizer("this is a simple input", return_tensors="np")
+        self.assertTrue("token_type_ids" in model.input_names)
+        token_type_ids = tokens.pop("token_type_ids")
+        outs = model(token_type_ids=token_type_ids, **tokens)
+        outs_without_token_type_ids = model(**tokens)
+        self.assertTrue(np.allclose(outs.last_hidden_state, outs_without_token_type_ids.last_hidden_state))
+        gc.collect()
+
 
 class ORTModelForMultipleChoiceIntegrationTest(ORTModelTestMixin):
     # Multiple Choice tests are conducted on different models due to mismatch size in model's classifier

From 6802a0c4e9868041aa825f629c5e983df96e3cab Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Tue, 29 Oct 2024 16:56:28 +0100
Subject: [PATCH 32/50] Add transformers 4.46 compatibility (#2078)

* transformers 4.46
* setup
* update setup
* fix t5
* update python (3.8 eol)
* fix onnx test
* fixed deberta, onnxruntime tests in series passing
* fix bt
* fixed t5_forward for real, because it's also used by blip-2 as well
* fix Phi3
* fix opt
* vision encoder decoder
* fix setup
* style
* fix encoder decoder
* fixed transformers branch
* branch
* allow 4.47
* remove patch
* add opt
* add test
* fix OPT ONNX export and inference
* add test
* update setup
* style
* merge tests
* update test num beams
* add test transformers version
* add architectures depending on transformers
* add warning
* revert
* update test generation length
* style

---------

Co-authored-by: IlyasMoutawwakil
---
 .github/workflows/check_code_quality.yml        |   2 +-
 .github/workflows/test_benckmark.yml            |  30 +-
 .github/workflows/test_cli.yml                  |   4 +-
 .github/workflows/test_export_onnx.yml          |  44 +--
 .github/workflows/test_export_onnx_cli.yml      |  30 +-
 .github/workflows/test_export_onnx_cli_timm.yml |  26 +-
 .github/workflows/test_export_onnx_timm.yml     |  27 +-
 .github/workflows/test_exporters_common.yml     |   2 +-
 .github/workflows/test_exporters_slow.yml       |   2 +-
 .github/workflows/test_fx.yml                   |   2 +-
 .github/workflows/test_offline.yml              |   2 +-
 .github/workflows/test_onnx.yml                 |   2 +-
 .github/workflows/test_onnxruntime.yml          |  13 +-
 .github/workflows/test_onnxruntime_slow.yml     |   2 +-
 .github/workflows/test_optimum_common.yml       |  39 +--
 .github/workflows/test_utils.yml                |   2 +-
 optimum/bettertransformer/models/attention.py   | 326 ++++++++++++------
 .../models/decoder_models.py                    |   4 +-
 optimum/bettertransformer/transformation.py     |  36 +-
 optimum/exporters/onnx/model_configs.py         |  49 ++-
 optimum/exporters/onnx/model_patcher.py         |   3 +-
 optimum/exporters/onnx/utils.py                 |   6 +-
 optimum/onnxruntime/modeling_decoder.py         |   4 +-
 optimum/utils/__init__.py                       |   1 +
 optimum/utils/import_utils.py                   |  16 +
 setup.py                                        |  24 +-
 tests/bettertransformer/test_audio.py           |  20 +-
 tests/bettertransformer/test_common.py          |  12 +-
 tests/bettertransformer/test_decoder.py         |   8 +-
 tests/bettertransformer/test_encoder.py         |   4 +-
 .../bettertransformer/test_encoder_decoder.py   |   2 +-
 tests/bettertransformer/test_gpu.py             |   4 +-
 tests/bettertransformer/testing_utils.py        |  18 +-
 tests/onnx/test_onnx_export_custom_module.py    |  17 +-
tests/onnxruntime/test_modeling.py | 61 ++-- tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 36 files changed, 541 insertions(+), 304 deletions(-) diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml index c429b706bff..861684cfa4d 100644 --- a/.github/workflows/check_code_quality.yml +++ b/.github/workflows/check_code_quality.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml index 7f7f2ace329..e859e845d64 100644 --- a/.github/workflows/test_benckmark.yml +++ b/.github/workflows/test_benckmark.yml @@ -4,9 +4,9 @@ name: Benchmark suite / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,20 +17,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install wheel - pip install .[tests,onnxruntime,benchmark] - - name: Test with unittest - run: | - python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install wheel + pip install .[tests,onnxruntime,benchmark] + - name: Test with unittest + run: | + python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml index ecb19d23aa3..2efab40aab6 100644 --- a/.github/workflows/test_cli.yml +++ b/.github/workflows/test_cli.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} @@ -34,7 +34,7 @@ jobs: run: | pip install --upgrade pip pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[tests,exporters,exporters-tf] + pip install .[tests,exporters-tf] - name: Test with pytest run: | diff --git a/.github/workflows/test_export_onnx.yml b/.github/workflows/test_export_onnx.yml index 56ef674cb41..0cd19a1724c 100644 --- a/.github/workflows/test_export_onnx.yml +++ b/.github/workflows/test_export_onnx.yml @@ -2,9 +2,9 @@ name: Exporters ONNX / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -15,27 +15,27 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - pytest 
diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml
index c429b706bff..861684cfa4d 100644
--- a/.github/workflows/check_code_quality.yml
+++ b/.github/workflows/check_code_quality.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml
index 7f7f2ace329..e859e845d64 100644
--- a/.github/workflows/test_benckmark.yml
+++ b/.github/workflows/test_benckmark.yml
@@ -4,9 +4,9 @@ name: Benchmark suite / Python - Test
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -17,20 +17,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        pip install wheel
-        pip install .[tests,onnxruntime,benchmark]
-    - name: Test with unittest
-      run: |
-        python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py'
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          pip install wheel
+          pip install .[tests,onnxruntime,benchmark]
+      - name: Test with unittest
+        run: |
+          python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py'
diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml
index ecb19d23aa3..2efab40aab6 100644
--- a/.github/workflows/test_cli.yml
+++ b/.github/workflows/test_cli.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04, macos-13]
 
     runs-on: ${{ matrix.os }}
@@ -34,7 +34,7 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[tests,exporters,exporters-tf]
+          pip install .[tests,exporters-tf]
 
       - name: Test with pytest
         run: |
diff --git a/.github/workflows/test_export_onnx.yml b/.github/workflows/test_export_onnx.yml
index 56ef674cb41..0cd19a1724c 100644
--- a/.github/workflows/test_export_onnx.yml
+++ b/.github/workflows/test_export_onnx.yml
@@ -2,9 +2,9 @@ name: Exporters ONNX / Python - Test
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -15,27 +15,27 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies for pytorch export
-      run: |
-        pip install .[tests,exporters]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        pytest exporters/onnx/test_onnx_*.py -s -n auto -m "not tensorflow_test and not timm_test" --durations=0
-    - name: Install dependencies for tensorflow export
-      run: |
-        pip install .[tests,exporters-tf]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        pytest exporters/onnx/test_onnx_*.py -n auto -m "tensorflow_test" -s --durations=0
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies for pytorch export
+        run: |
+          pip install .[tests,exporters]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          pytest exporters/onnx/test_onnx_*.py -s -n auto -m "not tensorflow_test and not timm_test" --durations=0
+      - name: Install dependencies for tensorflow export
+        run: |
+          pip install .[tests,exporters-tf]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          pytest exporters/onnx/test_onnx_*.py -n auto -m "tensorflow_test" -s --durations=0
diff --git a/.github/workflows/test_export_onnx_cli.yml b/.github/workflows/test_export_onnx_cli.yml
index 8fa4ebb045f..618a140c147 100644
--- a/.github/workflows/test_export_onnx_cli.yml
+++ b/.github/workflows/test_export_onnx_cli.yml
@@ -2,9 +2,9 @@ name: Exporters ONNX CLI / Python - Test
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -15,20 +15,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies for pytorch export
-      run: |
-        pip install .[tests,exporters]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies for pytorch export
+        run: |
+          pip install .[tests,exporters]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0
diff --git a/.github/workflows/test_export_onnx_cli_timm.yml b/.github/workflows/test_export_onnx_cli_timm.yml
index 76a535fcebd..b92d5551ba1 100644
--- a/.github/workflows/test_export_onnx_cli_timm.yml
+++ b/.github/workflows/test_export_onnx_cli_timm.yml
@@ -14,20 +14,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies for pytorch export
-      run: |
-        pip install .[tests,exporters]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        RUN_SLOW=1 pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -k "timm" -s --durations=0
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies for pytorch export
+        run: |
+          pip install .[tests,exporters]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          RUN_SLOW=1 pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -k "timm" -s --durations=0
diff --git a/.github/workflows/test_export_onnx_timm.yml b/.github/workflows/test_export_onnx_timm.yml
index 339e3e93dec..c16d20fbc18 100644
--- a/.github/workflows/test_export_onnx_timm.yml
+++ b/.github/workflows/test_export_onnx_timm.yml
@@ -14,21 +14,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies for pytorch export
-      run: |
-        pip install .[tests,exporters]
-    - name: Test with unittest
-      working-directory: tests
-      run: |
-        RUN_SLOW=1 pytest exporters/onnx/ -s -n auto -k "timm" --durations=0
-
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies for pytorch export
+        run: |
+          pip install .[tests,exporters]
+      - name: Test with unittest
+        working-directory: tests
+        run: |
+          RUN_SLOW=1 pytest exporters/onnx/ -s -n auto -k "timm" --durations=0
diff --git a/.github/workflows/test_exporters_common.yml b/.github/workflows/test_exporters_common.yml
index 8e8c3360c1f..11f6038afe4 100644
--- a/.github/workflows/test_exporters_common.yml
+++ b/.github/workflows/test_exporters_common.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_exporters_slow.yml b/.github/workflows/test_exporters_slow.yml
index b22fdd7fd2a..453389d63fa 100644
--- a/.github/workflows/test_exporters_slow.yml
+++ b/.github/workflows/test_exporters_slow.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml
index f0366cf0d1e..a4e6dd3cd29 100644
--- a/.github/workflows/test_fx.yml
+++ b/.github/workflows/test_fx.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04, macos-13]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_offline.yml b/.github/workflows/test_offline.yml
index 90b0108e512..20911fe6db8 100644
--- a/.github/workflows/test_offline.yml
+++ b/.github/workflows/test_offline.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml
index 22a11720798..dd1f3bee63d 100644
--- a/.github/workflows/test_onnx.yml
+++ b/.github/workflows/test_onnx.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04, macos-14]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
index a72bedb1ab7..0ab95752d01 100644
--- a/.github/workflows/test_onnxruntime.yml
+++ b/.github/workflows/test_onnxruntime.yml
@@ -17,8 +17,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        transformers-version: ["latest"]
         os: [ubuntu-20.04, windows-2019, macos-13]
+        include:
+          - transformers-version: "4.45.*"
+            os: ubuntu-20.04
 
     runs-on: ${{ matrix.os }}
 
     steps:
@@ -33,10 +36,10 @@
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: Setup Python ${{ matrix.python-version }}
+      - name: Setup Python
        uses: actions/setup-python@v5
        with:
-          python-version: ${{ matrix.python-version }}
+          python-version: 3.9
 
       - name: Install dependencies
         run: |
@@ -44,6 +47,10 @@
           pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
           pip install .[tests,onnxruntime]
 
+      - name: Install transformers ${{ matrix.transformers-version }}
+        if: ${{ matrix.transformers-version != 'latest' }}
+        run: pip install transformers==${{ matrix.transformers-version }}
+
       - name: Test with pytest (in series)
         working-directory: tests
         run: |
diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml
index 20371f79150..c5679e5b307 100644
--- a/.github/workflows/test_onnxruntime_slow.yml
+++ b/.github/workflows/test_onnxruntime_slow.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_optimum_common.yml b/.github/workflows/test_optimum_common.yml
index ded149c9b69..5ad42807a5f 100644
--- a/.github/workflows/test_optimum_common.yml
+++ b/.github/workflows/test_optimum_common.yml
@@ -4,9 +4,9 @@ name: Optimum common / Python - Test
 
 on:
   push:
-    branches: [ main ]
+    branches: [main]
   pull_request:
-    branches: [ main ]
+    branches: [main]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -17,25 +17,24 @@ jobs:
     strategy:
       fail-fast: false
      matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.9]
         os: [ubuntu-20.04, windows-2019, macos-13]
 
     runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install .[tests]
-        ls -l optimum/
-    - name: Test with unittest
-      shell: bash
-      run: |
-        # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel.
-        export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }}
-        pytest tests/test_*.py
-
+      - uses: actions/checkout@v2
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[tests]
+          ls -l optimum/
+      - name: Test with unittest
+        shell: bash
+        run: |
+          # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel.
+          export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }}
+          pytest tests/test_*.py
diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml
index 1ef33ced086..b5f2e27fc6a 100644
--- a/.github/workflows/test_utils.yml
+++ b/.github/workflows/test_utils.yml
@@ -16,7 +16,7 @@ jobs:
     fail-fast: false
     matrix:
       os: [ubuntu-20.04, macos-13]
-      python-version: [3.8, 3.9]
+      python-version: [3.9]
 
   runs-on: ${{ matrix.os }}
   steps:
diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py
index 22b8faf1c21..c8c91a04e4e 100644
--- a/optimum/bettertransformer/models/attention.py
+++ b/optimum/bettertransformer/models/attention.py
@@ -387,137 +387,243 @@ def opt_forward(
 
 
 # Adapted from transformers.models.t5.modeling_t5.T5Attention.forward
-def t5_forward(
-    self,
-    hidden_states,
-    mask=None,
-    key_value_states=None,
-    position_bias=None,
-    past_key_value=None,
-    layer_head_mask=None,
-    query_length=None,
-    use_cache=False,
-    output_attentions=False,
-    **kwargs,
-):
-    raise_on_head_mask(layer_head_mask)
+if check_if_transformers_greater("4.45.99"):
 
-    if output_attentions is True:
-        raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
-    if len(self.pruned_heads) > 0:
-        raise ValueError(f"Setting `pruned_heads` is unsupported with BetterTransformer, found {self.pruned_heads}.")
-
-    batch_size, seq_length = hidden_states.shape[:2]
-
-    real_seq_length = seq_length
-
-    if past_key_value is not None:
-        assert (
-            len(past_key_value) == 2
-        ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
-        real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
-
-    key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
-
-    def shape(states):
-        """projection"""
-        return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
-
-    def unshape(states):
-        """reshape"""
-        return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
-
-    def project(hidden_states, proj_layer, key_value_states, past_key_value):
-        """projects hidden states correctly to key/query states"""
-        if key_value_states is None:
-            # self-attn
-            # (batch_size, n_heads, seq_length, dim_per_head)
-            hidden_states = shape(proj_layer(hidden_states))
-        elif past_key_value is None:
-            # cross-attn
-            # (batch_size, n_heads, seq_length, dim_per_head)
-            hidden_states = shape(proj_layer(key_value_states))
+    def t5_forward(
+        self,
+        hidden_states,
+        mask=None,
+        key_value_states=None,
+        position_bias=None,
+        past_key_value=None,
+        layer_head_mask=None,
+        query_length=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        """
+        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
+        """
+        # Input is (batch_size, seq_length, dim)
+        # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder)
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
+        is_cross_attention = key_value_states is not None
+
+        query_states = self.q(hidden_states)
+        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+        if past_key_value is not None:
+            is_updated = past_key_value.is_updated.get(self.layer_idx)
+            if is_cross_attention:
+                # after the first generated id, we can subsequently re-use all key/value_states from cache
+                curr_past_key_value = past_key_value.cross_attention_cache
+            else:
+                curr_past_key_value = past_key_value.self_attention_cache
+
+        current_states = key_value_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_value is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.key_cache[self.layer_idx]
+            value_states = curr_past_key_value.value_cache[self.layer_idx]
+        else:
+            key_states = self.k(current_states)
+            value_states = self.v(current_states)
+            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+            if past_key_value is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention:
+                    past_key_value.is_updated[self.layer_idx] = True
+
+        if position_bias is None:
+            key_length = key_states.shape[-2]
+            # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past)
+            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
+            if not self.has_relative_attention_bias:
+                position_bias = torch.zeros(
+                    (1, self.n_heads, seq_length, key_length), device=query_states.device, dtype=query_states.dtype
+                )
+                if self.gradient_checkpointing and self.training:
+                    position_bias.requires_grad = True
+            else:
+                position_bias = self.compute_bias(
+                    real_seq_length, key_length, device=query_states.device, cache_position=cache_position
+                )
+                position_bias = position_bias[:, :, -seq_length:, :]
+
+            if mask is not None:
+                causal_mask = mask[:, :, :, : key_states.shape[-2]]
+                position_bias = position_bias + causal_mask
+
+        if self.pruned_heads:
+            mask = torch.ones(position_bias.shape[1])
+            mask[list(self.pruned_heads)] = 0
+            position_bias_masked = position_bias[:, mask.bool()]
+        else:
+            position_bias_masked = position_bias
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=position_bias_masked,
+            dropout_p=self.dropout if self.training else 0.0,
+            is_causal=False,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
+        attn_output = self.o(attn_output)
+
+        outputs = (attn_output, past_key_value, position_bias)
+
+        return outputs
+
+else:
+
+    def t5_forward(
+        self,
+        hidden_states,
+        mask=None,
+        key_value_states=None,
+        position_bias=None,
+        past_key_value=None,
+        layer_head_mask=None,
+        query_length=None,
+        use_cache=False,
+        output_attentions=False,
+        **kwargs,
+    ):
+        raise_on_head_mask(layer_head_mask)
+
+        if output_attentions is True:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+        if len(self.pruned_heads) > 0:
+            raise ValueError(
+                f"Setting `pruned_heads` is unsupported with BetterTransformer, found {self.pruned_heads}."
+            )
+
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        real_seq_length = seq_length
+
+        if past_key_value is not None:
+            assert (
+                len(past_key_value) == 2
+            ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states"
+            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
+
+        key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
+
+        def shape(states):
+            """projection"""
+            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+        def unshape(states):
+            """reshape"""
+            return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
+
+        def project(hidden_states, proj_layer, key_value_states, past_key_value):
+            """projects hidden states correctly to key/query states"""
             if key_value_states is None:
                 # self-attn
-                # (batch_size, n_heads, key_length, dim_per_head)
-                hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
-            elif past_key_value.shape[2] != key_value_states.shape[1]:
-                # checking that the `sequence_length` of the `past_key_value` is the same as
-                # the provided `key_value_states` to support prefix tuning
+                # (batch_size, n_heads, seq_length, dim_per_head)
+                hidden_states = shape(proj_layer(hidden_states))
+            elif past_key_value is None:
                 # cross-attn
                 # (batch_size, n_heads, seq_length, dim_per_head)
                 hidden_states = shape(proj_layer(key_value_states))
-            else:
-                # cross-attn
-                hidden_states = past_key_value
-        return hidden_states
-
-    # get query states
-    query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
-
-    # get key/value states
-    key_states = project(
-        hidden_states,
-        self.k,
-        key_value_states,
-        past_key_value[0] if past_key_value is not None else None,
-    )
-    value_states = project(
-        hidden_states,
-        self.v,
-        key_value_states,
-        past_key_value[1] if past_key_value is not None else None,
-    )
+            if past_key_value is not None:
+                if key_value_states is None:
+                    # self-attn
+                    # (batch_size, n_heads, key_length, dim_per_head)
+                    hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
+                elif past_key_value.shape[2] != key_value_states.shape[1]:
+                    # checking that the `sequence_length` of the `past_key_value` is the same as
+                    # the provided `key_value_states` to support prefix tuning
+                    # cross-attn
+                    # (batch_size, n_heads, seq_length, dim_per_head)
+                    hidden_states = shape(proj_layer(key_value_states))
+                else:
+                    # cross-attn
+                    hidden_states = past_key_value
+            return hidden_states
+
+        # get query states
+        query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
+
+        # get key/value states
+        key_states = project(
+            hidden_states,
+            self.k,
+            key_value_states,
+            past_key_value[0] if past_key_value is not None else None,
+        )
+        value_states = project(
+            hidden_states,
+            self.v,
+            key_value_states,
+            past_key_value[1] if past_key_value is not None else None,
+        )
 
-    dropout_p = self.dropout if self.training else 0.0
-    query_states = self.scale * query_states
-    if position_bias is None and not self.has_relative_attention_bias:
-        if mask is None:
-            attn_output = torch.nn.functional.scaled_dot_product_attention(
-                query_states, key_states, value_states, attn_mask=None, dropout_p=dropout_p, is_causal=False
-            )
-        elif mask is not None:
+        dropout_p = self.dropout if self.training else 0.0
+        query_states = self.scale * query_states
+        if position_bias is None and not self.has_relative_attention_bias:
             attn_output = torch.nn.functional.scaled_dot_product_attention(
                 query_states, key_states, value_states, attn_mask=mask, dropout_p=dropout_p, is_causal=False
             )
 
-    if position_bias is None:
-        if not self.has_relative_attention_bias:
-            position_bias = torch.zeros(
-                (1, self.n_heads, real_seq_length, key_length),
-                device=value_states.device,
-                dtype=value_states.dtype,
-            )
-            if self.gradient_checkpointing and self.training:
-                position_bias.requires_grad = True
+        if position_bias is None:
+            if not self.has_relative_attention_bias:
+                position_bias = torch.zeros(
+                    (1, self.n_heads, real_seq_length, key_length),
+                    device=value_states.device,
+                    dtype=value_states.dtype,
+                )
+                if self.gradient_checkpointing and self.training:
+                    position_bias.requires_grad = True
+            else:
+                position_bias = self.compute_bias(real_seq_length, key_length, device=value_states.device)
+
+            # if key and values are already calculated
+            # we want only the last query position bias
+            if past_key_value is not None:
+                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
+
+            if mask is not None:
+                position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
+
+        if self.has_relative_attention_bias:
+            attn_output = torch.nn.functional.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=position_bias,
+                dropout_p=dropout_p,
+                is_causal=False,
+            )
         else:
-            position_bias = self.compute_bias(real_seq_length, key_length, device=value_states.device)
-
-        # if key and values are already calculated
-        # we want only the last query position bias
-        if past_key_value is not None:
-            position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
-
-        if mask is not None:
-            position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
-
-    if self.has_relative_attention_bias:
             attn_output = torch.nn.functional.scaled_dot_product_attention(
                 query_states, key_states, value_states, attn_mask=position_bias, dropout_p=dropout_p, is_causal=False
             )
-    else:
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states, key_states, value_states, attn_mask=position_bias, dropout_p=dropout_p, is_causal=False
-        )
 
-    attn_output = unshape(attn_output)  # (batch_size, seq_length, dim)
-    attn_output = self.o(attn_output)
+        attn_output = unshape(attn_output)  # (batch_size, seq_length, dim)
+        attn_output = self.o(attn_output)
 
-    present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
-    outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)
+        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
+        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)
 
-    return outputs
+        return outputs
 
 
 # Adapted from transformers.models.bart.modeling_bart.BartAttention.forward
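The transformers >= 4.46 branch above follows the reworked T5 attention, which carries past key/values in an `EncoderDecoderCache` holding separate self- and cross-attention caches plus a per-layer `is_updated` flag. A minimal sketch of that cache protocol, assuming the `EncoderDecoderCache`/`DynamicCache` classes shipped with recent transformers releases:

    import torch
    from transformers.cache_utils import DynamicCache, EncoderDecoderCache

    cache = EncoderDecoderCache(DynamicCache(), DynamicCache())  # (self-attention, cross-attention)

    # Per layer and per step, t5_forward appends the new key/value states and
    # receives back the concatenated past + current tensors:
    key = torch.zeros(1, 8, 1, 64)  # (batch, heads, new tokens, head_dim)
    value = torch.zeros(1, 8, 1, 64)
    key_states, value_states = cache.self_attention_cache.update(key, value, layer_idx=0)

    # Cross-attention states are computed once per encoder input, then flagged
    # as reusable for all subsequent decoding steps:
    cache.is_updated[0] = True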
diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py
index 52d28d076d3..e8045e695c1 100644
--- a/optimum/bettertransformer/models/decoder_models.py
+++ b/optimum/bettertransformer/models/decoder_models.py
@@ -327,9 +327,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
             setattr(self, "relative_attention_bias", layer.relative_attention_bias)
             self.original_layers_mapping["relative_attention_bias"] = "relative_attention_bias"
 
-        self.module_mapping = None
-
+        self.layer_idx = getattr(layer, "layer_idx", None)
         self.is_decoder = layer.is_decoder
+        self.module_mapping = None
 
     def forward(self, *args, **kwargs):
         return t5_forward(self, *args, **kwargs)
diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py
index a101757b6fa..b138862752e 100644
--- a/optimum/bettertransformer/transformation.py
+++ b/optimum/bettertransformer/transformation.py
@@ -20,7 +20,13 @@
 import torch
 from packaging.version import parse
 
-from ..utils import check_if_pytorch_greater, is_accelerate_available, recurse_getattr, recurse_setattr
+from ..utils import (
+    check_if_pytorch_greater,
+    check_if_torch_greater,
+    is_accelerate_available,
+    recurse_getattr,
+    recurse_setattr,
+)
 from .models import BetterTransformerManager
 
 
@@ -213,15 +219,18 @@ def transform(
         hf_config = model.config
         if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]:
             raise ValueError(
-                f"Transformers now supports natively BetterTransformer optimizations (torch.nn.functional.scaled_dot_product_attention) for the model type {hf_config.model_type}. As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. Please upgrade to transformers>=4.36 and torch>=2.1.1 to use it. Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention."
+                f"Transformers now supports natively BetterTransformer optimizations (torch.nn.functional.scaled_dot_product_attention) for the model type {hf_config.model_type}. "
+                "As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. "
+                "Please upgrade to transformers>=4.36 and torch>=2.1.1 to use it. "
+                "Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention."
             )
 
-        # Check if we have to load the model using `accelerate`
-        if hasattr(model, "hf_device_map"):
-            load_accelerate = True
-            hf_device_map = model.hf_device_map
-        else:
-            load_accelerate = False
+        if hasattr(hf_config, "_attn_implementation") and hf_config._attn_implementation == "sdpa":
+            raise ValueError(
+                "This model already uses BetterTransformer optimizations from Transformers (torch.nn.functional.scaled_dot_product_attention). "
+                "As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. "
+                "Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention."
+            )
 
         if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True:
             raise Exception(
@@ -241,11 +250,20 @@ def transform(
                 f" Currently supported models are: {BetterTransformerManager.MODEL_MAPPING.keys()}."
             )
 
-        if parse(torch.__version__) <= parse("1.14"):
+        if not check_if_torch_greater("2.0"):
             raise ValueError(
                 f"BetterTransformer requires torch>=2.0 but {torch.__version__} is installed. Please upgrade PyTorch."
             )
 
+        hf_config = model.config
+
+        # Check if we have to load the model using `accelerate`
+        if hasattr(model, "hf_device_map"):
+            load_accelerate = True
+            hf_device_map = model.hf_device_map
+        else:
+            load_accelerate = False
+
         if load_accelerate:
             # Remove the hooks from the original model to avoid weights being on `meta` device.
             remove_hook_from_module(model, recurse=True)
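Since transformers 4.36, many architectures load with `attn_implementation="sdpa"` by default, which already gives the fused-attention path that BetterTransformer used to provide; the new check above rejects such models instead of silently double-patching them. The only configuration the transform still accepts, as a sketch (the model id is a placeholder):

    from transformers import AutoModel
    from optimum.bettertransformer import BetterTransformer

    model = AutoModel.from_pretrained("bert-base-uncased", attn_implementation="eager")
    model = BetterTransformer.transform(model)  # raises a ValueError when the model already uses SDPA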
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index e77f649f69b..9e57128c272 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -155,7 +155,7 @@ class SplinterOnnxConfig(BertOnnxConfig):
 
 
 class DistilBertOnnxConfig(BertOnnxConfig):
-    DEFAULT_ONNX_OPSET = 11
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for transformers>=4.46.0
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
@@ -266,10 +266,18 @@ class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
 
-class OPTOnnxConfig(TextDecoderOnnxConfig):
-    # OPT does not require position_ids input.
-    DEFAULT_ONNX_OPSET = 13
-    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+# OPT does not take position_ids as input for transformers < v4.46, needs it for transformers >= v4.46
+if check_if_transformers_greater("4.45.99"):
+
+    class OPTOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
+        DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
+        NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+
+else:
+
+    class OPTOnnxConfig(TextDecoderOnnxConfig):
+        DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
+        NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
 
 class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
@@ -304,6 +312,15 @@ class Phi3OnnxConfig(PhiOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfigWithGQA
     MIN_TRANSFORMERS_VERSION = version.parse("4.41.0")
 
+    def __init__(self, *args, **kwargs):
+        # TODO : replace check_if_transformers_greater with is_transformers_available
+        if check_if_transformers_greater("4.46.0") and not check_if_transformers_greater("4.46.1"):
+            logger.error(
+                "Found transformers v4.46.0 while trying to export a Phi3 model, this specific version of transformers is not supported. "
+                "Please upgrade to v4.46.1 or higher, or downgrade your transformers version"
+            )
+        super().__init__(*args, **kwargs)
+
 
 class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35
@@ -480,7 +497,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
 
 
 class T5OnnxConfig(TextSeq2SeqOnnxConfig):
-    DEFAULT_ONNX_OPSET = 13
+    DEFAULT_ONNX_OPSET = 14  # T5 uses aten::triu that requires opset>=14
     DUMMY_INPUT_GENERATOR_CLASSES = TextSeq2SeqOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES[:-1] + (
         T5DummySeq2SeqPastKeyValuesGenerator,
     )
@@ -2027,6 +2044,7 @@ class TrOCROnnxConfig(TextSeq2SeqOnnxConfig):
 class VisionEncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig
     ATOL_FOR_VALIDATION = 1e-3
+    DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
 
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator)
 
@@ -2156,8 +2174,21 @@ class Pix2StructOnnxConfig(OnnxSeq2SeqConfigWithPast):
         DummySeq2SeqPastKeyValuesGenerator,
         DummyPix2StructInputGenerator,
     )
-    # Min operator needs to support int64, which is the case for opset>=12
-    DEFAULT_ONNX_OPSET = 12
+
+    DEFAULT_ONNX_OPSET = 14  # uses 'aten::triu' now, which requires opset>=14
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # TODO : replace check_if_transformers_greater with is_transformers_available
+        if (
+            check_if_transformers_greater("4.46.0")
+            and not check_if_transformers_greater("4.46.1")
+            and self._behavior is ConfigBehavior.DECODER
+        ):
+            logger.error(
+                "Found transformers v4.46.0 while trying to export a Pix2Struct model, this specific version of transformers is not supported. "
+                "Please upgrade to v4.46.1 or higher, or downgrade your transformers version"
+            )
 
     @property
     def inputs(self):
@@ -2310,3 +2341,5 @@ def overwrite_shape_and_generate_input(
 
 class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig
+
+    DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
index 34ed5fcae46..fdfb0e280f5 100644
--- a/optimum/exporters/onnx/model_patcher.py
+++ b/optimum/exporters/onnx/model_patcher.py
@@ -34,11 +34,10 @@
 
 if _transformers_version > version.parse("4.34.99"):
-    from transformers.modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask
+    from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 
     if _transformers_version >= version.parse("4.36"):
         from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa
 else:
-    _prepare_4d_causal_attention_mask = None
     _prepare_4d_causal_attention_mask_for_sdpa = None
     AttentionMaskConverter = None
diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py
index 675566ba23e..56249bbf5c3 100644
--- a/optimum/exporters/onnx/utils.py
+++ b/optimum/exporters/onnx/utils.py
@@ -27,7 +27,7 @@
     is_diffusers_available,
     logging,
 )
-from ...utils.import_utils import _diffusers_version
+from ...utils.import_utils import _diffusers_version, check_if_transformers_greater
 from ..utils import (
     _get_submodels_and_export_configs,
 )
@@ -89,6 +89,10 @@
 }
 
 
+if check_if_transformers_greater("4.45.99"):
+    MODEL_TYPES_REQUIRING_POSITION_IDS.add("opt")
+
+
 def check_onnxruntime_requirements(minimum_version: version.Version):
     """
     Checks that ONNX Runtime is installed and if version is recent enough.
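Whether an exported OPT decoder consumes `position_ids` is baked into the ONNX graph at export time, so the runtime side (next diff) decides by inspecting the graph inputs rather than the transformers version installed at load time. The idea, as a sketch built around a hypothetical helper:

    import onnx

    def exported_with_position_ids(model_path: str) -> bool:
        # OPT decoders exported with transformers >= 4.46 carry a position_ids input.
        model = onnx.load(model_path, load_external_data=False)
        return any(inp.name == "position_ids" for inp in model.graph.input)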
diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py
index bda3ec98d9a..984d7f22ebf 100644
--- a/optimum/onnxruntime/modeling_decoder.py
+++ b/optimum/onnxruntime/modeling_decoder.py
@@ -582,7 +582,8 @@ def _from_pretrained(
             init_cls = ORTFalconForCausalLM
         elif config.model_type == "mpt":
             init_cls = ORTMPTForCausalLM
-        elif config.model_type == "opt":
+        # if model was exported with position_ids it means the model was exported with transformers >= v4.46
+        elif config.model_type == "opt" and "position_ids" not in input_dims:
             init_cls = ORTOPTForCausalLM
         elif config.model_type == "gpt_bigcode":
             init_cls = ORTGPTBigCodeForCausalLM
@@ -839,7 +840,6 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg
         attention_mask = kwargs.get("attention_mask", None)
         use_cache = kwargs.get("use_cache", None)
 
-
         return {
             "input_ids": input_ids,
             "past_key_values": past_key_values,
diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py
index 5d5044e63e1..db7d1f6975d 100644
--- a/optimum/utils/__init__.py
+++ b/optimum/utils/__init__.py
@@ -29,6 +29,7 @@
     TRANSFORMERS_MINIMUM_VERSION,
     check_if_diffusers_greater,
     check_if_pytorch_greater,
+    check_if_torch_greater,
     check_if_transformers_greater,
     is_accelerate_available,
     is_auto_gptq_available,
diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py
index 4a57fda79ce..35a6294ab52 100644
--- a/optimum/utils/import_utils.py
+++ b/optimum/utils/import_utils.py
@@ -193,6 +193,22 @@ def check_if_diffusers_greater(target_version: str) -> bool:
     return version.parse(_diffusers_version) >= version.parse(target_version)
 
 
+def check_if_torch_greater(target_version: str) -> bool:
+    """
+    Checks whether the current install of torch is greater than or equal to the target version.
+
+    Args:
+        target_version (str): version used as the reference for comparison.
+
+    Returns:
+        bool: whether the check is True or not.
+    """
+    if not is_torch_available():
+        return False
+
+    return torch_version >= version.parse(target_version)
+
+
 @contextmanager
 def require_numpy_strictly_lower(package_version: str, message: str):
     if not version.parse(np.__version__) < version.parse(package_version):
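The `check_if_torch_greater` helper added above mirrors the existing `check_if_pytorch_greater`/`check_if_transformers_greater` utilities, comparing the parsed torch version and returning `False` when torch is not installed. Usage sketch:

    from optimum.utils import check_if_torch_greater

    if check_if_torch_greater("2.0"):
        # safe to rely on torch.nn.functional.scaled_dot_product_attention
        pass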
diff --git a/setup.py b/setup.py
index 822d8be1b80..82892bfcc8c 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
 REQUIRED_PKGS = [
     "coloredlogs",
     "sympy",
-    "transformers[sentencepiece]>=4.29",
+    "transformers>=4.29",
     "torch>=1.11",
     "packaging",
     "numpy",
@@ -37,9 +37,9 @@
     "diffusers>=0.17.0",
     "torchaudio",
     "einops",
-    "invisible-watermark",
     "timm",
     "scikit-learn",
+    "sentencepiece",
     "rjieba",
 ]
 
@@ -54,7 +54,7 @@
         "datasets>=1.2.1",
         "evaluate",
         "protobuf>=3.20.1",
-        "transformers<4.46.0",
+        "transformers<4.47.0",
     ],
     "onnxruntime-gpu": [
         "onnx",
@@ -63,10 +63,20 @@
         "evaluate",
         "protobuf>=3.20.1",
         "accelerate",  # ORTTrainer requires it.
-        "transformers<4.46.0",
+        "transformers<4.47.0",
+    ],
+    "exporters": [
+        "onnx",
+        "onnxruntime",
+        "timm",
+        "transformers<4.47.0",
+    ],
+    "exporters-gpu": [
+        "onnx",
+        "onnxruntime-gpu",
+        "timm",
+        "transformers<4.47.0",
     ],
-    "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.46.0"],
-    "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.46.0"],
     "exporters-tf": [
         "tensorflow>=2.4,<=2.12.1",
         "tf2onnx",
@@ -76,7 +86,7 @@
         "h5py",
         "numpy<1.24.0",
         "datasets<=2.16",
-        "transformers[sentencepiece]>=4.26,<4.38",
+        "transformers>=4.26,<4.38",
     ],
     "diffusers": ["diffusers"],
     "intel": "optimum-intel>=1.18.0",
diff --git a/tests/bettertransformer/test_audio.py b/tests/bettertransformer/test_audio.py
index be01a92d447..caca91e27ca 100644
--- a/tests/bettertransformer/test_audio.py
+++ b/tests/bettertransformer/test_audio.py
@@ -35,7 +35,7 @@ class TestsWhisper(unittest.TestCase):
     def test_error_message(self):
-        model = AutoModel.from_pretrained("openai/whisper-tiny")
+        model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")
 
         with self.assertRaises(ValueError) as cm:
             model = BetterTransformer.transform(model)
@@ -82,15 +82,19 @@ def _test_fp16_inference(
         set_seed(0)
         if not use_to_operator:
-            hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0)
+            hf_random_model = automodel_class.from_pretrained(
+                model_id, torch_dtype=torch.float16, attn_implementation="eager"
+            ).to(0)
             converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=False)
-            hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0)
+            hf_random_model = automodel_class.from_pretrained(
+                model_id, torch_dtype=torch.float16, attn_implementation="eager"
+            ).to(0)
         else:
-            hf_random_model = automodel_class.from_pretrained(model_id).to(0)
+            hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0)
             converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=False)
-            hf_random_model = automodel_class.from_pretrained(model_id).to(0)
+            hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0)
 
             hf_random_model = hf_random_model.to(torch.float16)
             converted_model = converted_model.to(torch.float16)
@@ -147,7 +151,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int):
         model_id = MODELS_DICT[model_type]
         processor = AutoProcessor.from_pretrained(model_id)
-        model = AutoModel.from_pretrained(model_id)
+        model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
 
         text = ["This is me and me"]
         if batch_size > 1:
@@ -217,14 +221,14 @@ def test_logits(self, model_type: str):
         inputs = self.prepare_inputs_for_class(model_id, model_type)
 
         torch.manual_seed(0)
-        hf_random_model = AutoModel.from_pretrained(model_id).eval()
+        hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
         random_config = hf_random_model.config
 
         torch.manual_seed(0)
         converted_model = BetterTransformer.transform(hf_random_model)
 
         torch.manual_seed(0)
-        hf_random_model = AutoModel.from_pretrained(model_id).eval()
+        hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
         random_config = hf_random_model.config
 
         self.assertFalse(
diff --git a/tests/bettertransformer/test_common.py b/tests/bettertransformer/test_common.py
index 35b89d2ed2e..b8bc0a3b3d9 100644
--- a/tests/bettertransformer/test_common.py
+++ b/tests/bettertransformer/test_common.py
@@ -28,7 +28,7 @@ class BetterTransformerIntegrationTests(unittest.TestCase):
     def test_raise_error_on_double_transform_call(self):
-        model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
+        model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="eager")
 
         with self.assertRaises(Exception) as cm:
             bt_model = BetterTransformer.transform(model)
@@ -59,7 +59,7 @@ def test_raise_on_save(self, model_type: str):
         )
         for model_id in model_ids:
             with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname:
-                hf_model = AutoModel.from_pretrained(model_id).eval()
+                hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
                 bt_model = BetterTransformer.transform(hf_model, keep_original_model=False)
                 bt_model.save_pretrained(tmpdirname)
 
@@ -73,7 +73,7 @@ def test_conversion(self, model_type: str):
             MODELS_DICT[model_type] if isinstance(MODELS_DICT[model_type], tuple) else (MODELS_DICT[model_type],)
         )
         for model_id in model_ids:
-            hf_random_model = AutoModel.from_pretrained(model_id)
+            hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
             converted_model = BetterTransformer.transform(hf_random_model)
 
             self.assertTrue(
@@ -99,7 +99,7 @@ def test_raise_save_pretrained_error(self, test_name: str, model_type: str, keep
         )
         for model_id in model_ids:
             # get hf and bt model
-            hf_model = AutoModel.from_pretrained(model_id)
+            hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
             # get bt model and invert it
             bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model)
 
@@ -145,9 +145,11 @@ def test_raise_activation_fun(self, model_type: str):
         )()  # random config class for the model to test
         hf_random_config.hidden_act = "silu"
 
-        hf_random_model = AutoModel.from_config(hf_random_config).eval()
+        hf_random_model = AutoModel.from_config(hf_random_config, attn_implementation="eager").eval()
+
         with self.assertRaises(ValueError) as cm:
             _ = BetterTransformer.transform(hf_random_model, keep_original_model=True)
+
         self.assertTrue("Activation function" in str(cm.exception))
 
     def test_dict_class_consistency(self):
diff --git a/tests/bettertransformer/test_decoder.py b/tests/bettertransformer/test_decoder.py
index bab8f376fcc..e2bc6ddc2fb 100644
--- a/tests/bettertransformer/test_decoder.py
+++ b/tests/bettertransformer/test_decoder.py
@@ -131,7 +131,7 @@ def test_logits_with_cache(self, test_name: str, model_type: str, batch_size: in
         model_id = MODELS_DICT[model_type]
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager")
 
         normalized_config = NormalizedConfigManager.get_normalized_config_class(model.config.model_type)(model.config)
 
@@ -167,7 +167,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd
         model_id = MODELS_DICT[model_type]
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager")
 
         if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None:
             if tokenizer.eos_token != "":
@@ -224,7 +224,9 @@ def test_invert_model_logits(self, test_name: str, model_type: str, keep_origina
     @require_torch_gpu
     @require_accelerate
     def test_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_memory=None):
-        hf_model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto", max_memory=max_memory).eval()
+        hf_model = AutoModelForCausalLM.from_pretrained(
+            "gpt2", device_map="auto", max_memory=max_memory, attn_implementation="eager"
+        ).eval()
         bt_model = BetterTransformer.transform(
             hf_model, keep_original_model=keep_original_model, max_memory=max_memory
         )
diff --git a/tests/bettertransformer/test_encoder.py b/tests/bettertransformer/test_encoder.py
index 74aacaed58c..7dd42c43b05 100644
--- a/tests/bettertransformer/test_encoder.py
+++ b/tests/bettertransformer/test_encoder.py
@@ -181,7 +181,9 @@ def check_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_m
         If this works for roberta, it should work for all other models too.
         """
 
-        hf_model = AutoModel.from_pretrained("xlm-roberta-base", device_map="auto", max_memory=max_memory).eval()
+        hf_model = AutoModel.from_pretrained(
+            "xlm-roberta-base", device_map="auto", max_memory=max_memory, attn_implementation="eager"
+        ).eval()
         bt_model = BetterTransformer.transform(
             hf_model, keep_original_model=keep_original_model, max_memory=max_memory
         )
diff --git a/tests/bettertransformer/test_encoder_decoder.py b/tests/bettertransformer/test_encoder_decoder.py
index 8d05923522a..5ce4d62b12c 100644
--- a/tests/bettertransformer/test_encoder_decoder.py
+++ b/tests/bettertransformer/test_encoder_decoder.py
@@ -153,7 +153,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd
         model_id = MODELS_DICT[model_type]
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_id, attn_implementation="eager")
 
         if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
diff --git a/tests/bettertransformer/test_gpu.py b/tests/bettertransformer/test_gpu.py
index b992b90d3c8..ada38e408fa 100644
--- a/tests/bettertransformer/test_gpu.py
+++ b/tests/bettertransformer/test_gpu.py
@@ -26,7 +26,9 @@ def timing_cuda(model, num_batches, input_ids, masks, decoder_input_ids):
 
 def benchmark(model_name: str, num_batches: int, batch_size: int, max_seqlen: int, is_half: bool):
-    hf_model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16 if is_half else None).eval()
+    hf_model = AutoModel.from_pretrained(
+        model_name, torch_dtype=torch.float16 if is_half else None, attn_implementation="eager"
+    ).eval()
     hf_model = hf_model.to("cuda:0")
 
     bt_model = BetterTransformer.transform(hf_model, keep_original_model=True)
diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py
index 098882180aa..f79cbb34512 100644
--- a/tests/bettertransformer/testing_utils.py
+++ b/tests/bettertransformer/testing_utils.py
@@ -136,10 +136,12 @@ def _test_fp16_inference(
         torch.manual_seed(0)
         if not use_to_operator:
-            hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0)
+            hf_random_model = automodel_class.from_pretrained(
+                model_id, torch_dtype=torch.float16, attn_implementation="eager"
+            ).to(0)
             converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True)
         else:
-            hf_random_model = automodel_class.from_pretrained(model_id).to(0)
+            hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0)
             converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True)
             hf_random_model = hf_random_model.to(torch.float16)
             converted_model = converted_model.to(torch.float16)
@@ -169,7 +171,7 @@
     def _test_logits_backward(self, model_id: str, model_type: str, **preprocessor_kwargs):
         inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **preprocessor_kwargs)
 
-        hf_random_model = AutoModel.from_pretrained(model_id).eval()
+        hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
         random_config = hf_random_model.config
 
         # I could not obtain reproducible results with `torch.manual_seed` nor with
@@ -309,7 +311,7 @@ def _test_train_decoder(self, model_id: str, model_type: str, **kwargs):
         """
         inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **kwargs)
 
-        hf_random_model = AutoModel.from_pretrained(model_id).eval()
+        hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
         bt_model = BetterTransformer.transform(hf_random_model, keep_original_model=True)
         bt_model.train()
 
@@ -328,7 +330,7 @@ def _test_invert_modules(self, model_id, keep_original_model=False):
         r"""
         Test that the inverse converted model and hf model have the same modules
         """
-        hf_model = AutoModel.from_pretrained(model_id)
+        hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
         hf_modules = list(hf_model.modules())
 
         bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model)
@@ -349,7 +351,7 @@ def _test_invert_modules(self, model_id, keep_original_model=False):
 
     def _test_save_load_invertible(self, model_id, keep_original_model=True):
         with tempfile.TemporaryDirectory() as tmpdirname:
-            hf_model = AutoModel.from_pretrained(model_id).eval()
+            hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval()
             hf_model_state_dict = copy.deepcopy(hf_model.state_dict())
 
             bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model)
@@ -362,7 +364,7 @@ def _test_save_load_invertible(self, model_id, keep_original_model=True):
             # saving a normal transformers bark model fails because of shared tensors
             bt_model.save_pretrained(tmpdirname, safe_serialization=hf_model.config.model_type != "bark")
 
-            bt_model_from_load = AutoModel.from_pretrained(tmpdirname)
+            bt_model_from_load = AutoModel.from_pretrained(tmpdirname, attn_implementation="eager")
 
             self.assertEqual(
                 set(bt_model.state_dict().keys()),
@@ -397,7 +399,7 @@ def _test_invert_model_logits(
         """
         inputs = self.prepare_inputs_for_class(model_id, model_type=model_type, **preprocessor_kwargs)
 
-        hf_model = AutoModel.from_pretrained(model_id)
+        hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager")
         hf_model = hf_model.eval()
 
         with torch.inference_mode():
diff --git a/tests/onnx/test_onnx_export_custom_module.py b/tests/onnx/test_onnx_export_custom_module.py
index a144d5cd840..4398c14f01d 100644
--- a/tests/onnx/test_onnx_export_custom_module.py
+++ b/tests/onnx/test_onnx_export_custom_module.py
@@ -24,6 +24,8 @@
     import torch
     from transformers.models.deberta import modeling_deberta
 
+    from optimum.utils import check_if_torch_greater
+
 
 class StableDropoutTestCase(TestCase):
     """Tests export of StableDropout module."""
@@ -50,8 +52,8 @@ def test_training(self):
             training=training,
         )
 
-        # Expected to fail with opset_version < 12
-        with self.assertRaises(Exception):
+        if check_if_torch_greater("2.5"):
+            # Expected to pass with opset_version < 12 on torch >= 2.5
             torch.onnx.export(
                 sd,
                 input,
@@ -60,3 +62,14 @@ def test_training(self):
                 do_constant_folding=do_constant_folding,
                 training=training,
             )
+        else:
+            # Expected to fail with opset_version < 12 on torch < 2.5
+            with self.assertRaises(Exception):
+                torch.onnx.export(
+                    sd,
+                    input,
+                    devnull,
+                    opset_version=11,
+                    do_constant_folding=do_constant_folding,
+                    training=training,
+                )
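The causal-LM test below is rebuilt around `GenerationConfig` objects so that one token budget can be replayed across plain beam search, beam sample and group beam search, comparing ONNX Runtime against transformers token-for-token. The pattern, as a minimal sketch:

    from transformers import GenerationConfig

    gen_kwargs = {"max_new_tokens": 5, "min_new_tokens": 5, "eos_token_id": None, "num_beams": 4}

    beam_search = GenerationConfig(do_sample=False, **gen_kwargs)
    beam_sample = GenerationConfig(do_sample=True, **gen_kwargs)
    group_beam_search = GenerationConfig(do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs)

    # outputs = model.generate(**tokens, generation_config=beam_search)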
diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index da450b8e31c..597eb581e2a 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -54,6 +54,7 @@
     AutoModelForTokenClassification,
     AutoModelForVision2Seq,
     AutoTokenizer,
+    GenerationConfig,
     MBartForConditionalGeneration,
     Pix2StructForConditionalGeneration,  # Pix2Struct does not work with AutoModel
     PretrainedConfig,
@@ -106,7 +107,7 @@
     DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
     logging,
 )
-from optimum.utils.import_utils import is_diffusers_available
+from optimum.utils.import_utils import check_if_transformers_greater, is_diffusers_available
 from optimum.utils.testing_utils import (
     grid_parameters,
     remove_directory,
@@ -2326,10 +2327,12 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin):
         "llama",
         "mistral",
         "mpt",
-        "phi3",
-        "qwen2",
+        "opt",
     ]
 
+    if check_if_transformers_greater("4.40"):
+        SUPPORTED_ARCHITECTURES.extend(["gemma", "phi3", "qwen2"])
+
     FULL_GRID = {
         "model_arch": SUPPORTED_ARCHITECTURES,
         "use_cache": [False, True],
@@ -2338,7 +2341,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin):
     ORTMODEL_CLASS = ORTModelForCausalLM
     TASK = "text-generation"
 
-    GENERATION_LENGTH = 100
+    GENERATION_LENGTH = 90
     SPEEDUP_CACHE = 1.1
 
     @parameterized.expand([(False,), (True,)])
@@ -2411,7 +2414,7 @@ def test_merge_from_onnx_and_save(self, model_arch):
         self.assertNotIn(ONNX_DECODER_WITH_PAST_NAME, folder_contents)
         self.assertNotIn(ONNX_WEIGHTS_NAME, folder_contents)
 
-    @parameterized.expand(grid_parameters({**FULL_GRID, "num_beams": [1, 3]}))
+    @parameterized.expand(grid_parameters({**FULL_GRID, "num_beams": [1, 4]}))
     def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cache: bool, num_beams: int):
         use_io_binding = None
         if use_cache is False:
@@ -2474,25 +2477,39 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach
         # TODO: remove once https://github.com/huggingface/transformers/pull/26873 is released, falcon is broken in transformers
         new_tokens = 5
-        onnx_outputs = onnx_model.generate(
-            **tokens,
-            num_beams=num_beams,
-            do_sample=False,
-            min_new_tokens=new_tokens,
-            max_new_tokens=new_tokens,
-            eos_token_id=None,
-        )
+        gen_kwargs = {
+            "max_new_tokens": new_tokens,
+            "min_new_tokens": new_tokens,
+            "eos_token_id": None,
+            "num_beams": num_beams,
+        }
 
-        transformers_outputs = transformers_model.generate(
-            **tokens,
-            num_beams=num_beams,
-            do_sample=False,
-            min_new_tokens=new_tokens,
-            max_new_tokens=new_tokens,
-            eos_token_id=None,
-        )
+        beam_search_gen_config = GenerationConfig(do_sample=False, **gen_kwargs)
+
+        if use_cache and num_beams == 4:
+            beam_sample_gen_config = GenerationConfig(do_sample=True, **gen_kwargs)
+            group_beam_search_gen_config = GenerationConfig(
+                do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs
+            )
+            gen_configs = (
+                beam_search_gen_config,
+                beam_sample_gen_config,
+                group_beam_search_gen_config,
+            )
+        else:
+            gen_configs = (beam_search_gen_config,)
 
-        self.assertTrue(torch.allclose(onnx_outputs, transformers_outputs))
+        for gen_config in gen_configs:
+            set_seed(SEED)
+            with torch.no_grad():
+                transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config)
+            set_seed(SEED)
+            onnx_outputs = onnx_model.generate(**tokens, generation_config=gen_config)
+
+            self.assertTrue(
+                torch.equal(onnx_outputs, transformers_outputs),
+                f"Failed with generation config : {gen_config}, transformers outputs {transformers_outputs}, ONNX model outputs {onnx_outputs}",
+            )
 
         gc.collect()
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index 5071d0081af..e3d54237857 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -125,6 +125,7 @@
     "mpt": "hf-internal-testing/tiny-random-MptForCausalLM",
     "mt5": "lewtun/tiny-random-mt5",
     "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
+    "opt": "hf-internal-testing/tiny-random-OPTModel",
     "pegasus": "hf-internal-testing/tiny-random-PegasusModel",
     "perceiver_text": "hf-internal-testing/tiny-random-language_perceiver",
     "perceiver_vision": "hf-internal-testing/tiny-random-vision_perceiver_conv",

From 7e8d857d1ed6be32046324bf8f424690f116b4e9 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Thu, 31 Oct 2024 14:54:05 -0600
Subject: [PATCH 33/50] Add ONNX export support for granite models (#2043)

* feat(exporters/onnx): Add GraniteOnnxConfig and task support list

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* feat: Add granite's normalized config for inference

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* feat(onnx opt): Add onnx optimization support for granite

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* fix(onnx/granite): Use LlamaOnnxConfig as the base for GraniteOnnxConfig

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* fix(onnxruntime): Add "granite" to list of model types with grouped attention

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* fix: Add granite to the list of models that require position_ids

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* fix(granite): Add MIN_TORCH_VERSION for recently fixed torch bug

https://github.com/huggingface/optimum/pull/2043#issuecomment-2427975461

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* test(granite): Add tiny random granite test for onnx exporter

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

* tests(onnxruntime): Add granite to onnxruntime tests

Branch: OnnxGranite

Signed-off-by: Gabe Goodhart

---------

Signed-off-by: Gabe Goodhart
---
 optimum/exporters/onnx/model_configs.py | 5 +++++
 optimum/exporters/onnx/utils.py | 1 +
 optimum/exporters/tasks.py | 7 +++++++
 optimum/onnxruntime/modeling_decoder.py | 2 +-
 optimum/onnxruntime/utils.py | 1 +
 optimum/utils/normalized_config.py | 1 +
 tests/exporters/exporters_utils.py | 1 +
 tests/onnxruntime/test_modeling.py | 1 +
 tests/onnxruntime/utils_onnxruntime_tests.py | 1 +
 9 files changed, 19 insertions(+), 1 deletion(-)
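With the configuration below registered, Granite checkpoints export and load like any other supported decoder. For example, using the tiny test checkpoint referenced in this patch (any Granite model id would work the same way):

    from optimum.onnxruntime import ORTModelForCausalLM

    model = ORTModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-GraniteForCausalLM", export=True)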
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 56249bbf5c3..19e24f88743 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -86,6 +86,7 @@ "phi", "phi3", "qwen2", + "granite", } diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index a489f34fb06..fdc8bfcb539 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -915,6 +915,13 @@ class TasksManager: "text-classification", onnx="LlamaOnnxConfig", ), + "granite": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + onnx="GraniteOnnxConfig", + ), "pegasus": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index 984d7f22ebf..8f1d062221a 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -340,7 +340,7 @@ def prepare_past_key_values( if self.model_type == "gemma": num_attention_heads = self.normalized_config.num_key_value_heads embed_size_per_head = self.normalized_config.head_dim - elif self.model_type in {"mistral", "llama", "qwen2"}: + elif self.model_type in {"mistral", "llama", "qwen2", "granite"}: num_attention_heads = self.normalized_config.num_key_value_heads else: num_attention_heads = self.normalized_config.num_attention_heads diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 128e2406f11..9e92e0bd325 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -128,6 +128,7 @@ class ORTConfigManager: "gpt-neo": "gpt2", "gpt-neox": "gpt2", "gptj": "gpt2", + "granite": "gpt2", # longt5 with O4 results in segmentation fault "longt5": "bert", "llama": "gpt2", diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 81207b76496..9ceed24c2dd 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -281,6 +281,7 @@ class NormalizedConfigManager: "xlm-roberta": NormalizedTextConfig, "yolos": NormalizedVisionConfig, "qwen2": NormalizedTextConfig, + "granite": NormalizedTextConfigWithGQA, } @classmethod diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index c8a33b0be35..ccccb5510bf 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -100,6 +100,7 @@ "gpt-neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt-neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", "ibert": "hf-internal-testing/tiny-random-IBertModel", "imagegpt": "hf-internal-testing/tiny-random-ImageGPTModel", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 597eb581e2a..a335e014478 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2324,6 +2324,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "gpt_neo", "gpt_neox", "gptj", + "granite", "llama", "mistral", "mpt", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index e3d54237857..9f200e69b3d 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ 
b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -104,6 +104,7 @@ "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJForCausalLM", + "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", From 35eebfe62bf721bbab365f569bd0c73057239732 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:01:46 +0100 Subject: [PATCH 34/50] Drop python 3.8 (#2086) * drop python 3.8 * fix * add python 3.11 --- .github/workflows/dev_test_benckmark.yml | 8 ++------ .github/workflows/dev_test_bettertransformer.yml | 6 ++---- .github/workflows/dev_test_dummy_inputs.yml | 4 +--- .github/workflows/dev_test_exporters.yml | 8 ++------ .github/workflows/dev_test_fx.yml | 4 +--- .github/workflows/dev_test_onnx.yml | 4 +--- .github/workflows/dev_test_onnxruntime.yml | 4 +--- .github/workflows/dev_test_optimum_common.yml | 5 +---- .github/workflows/test_export_onnx.yml | 2 +- .github/workflows/test_export_tflite.yml | 5 ++--- .github/workflows/test_export_tflite_cli.yml | 5 ++--- .../test_export_tflite_cli_dynamic_quantization_int8.yml | 5 ++--- .../test_export_tflite_cli_quantization_fp16.yml | 5 ++--- .../test_export_tflite_cli_quantization_full_int8.yml | 5 ++--- ...export_tflite_cli_quantization_int8_custom_dataset.yml | 5 ++--- ...xport_tflite_cli_quantization_int8_default_dataset.yml | 5 ++--- .../test_export_tflite_cli_quantization_int8x16.yml | 5 ++--- .github/workflows/test_exporters_common.yml | 5 ++--- .github/workflows/test_exporters_slow.yml | 5 ++--- .github/workflows/test_fx.yml | 2 +- .github/workflows/test_offline.yml | 5 ++--- .github/workflows/test_onnx.yml | 2 +- .github/workflows/test_onnxruntime.yml | 2 +- .github/workflows/test_onnxruntime_slow.yml | 2 +- .github/workflows/test_optimum_common.yml | 4 ++-- .github/workflows/test_utils.yml | 2 +- setup.py | 7 ++++--- 27 files changed, 45 insertions(+), 76 deletions(-) diff --git a/.github/workflows/dev_test_benckmark.yml b/.github/workflows/dev_test_benckmark.yml index 5f6fc825021..a898d288625 100644 --- a/.github/workflows/dev_test_benckmark.yml +++ b/.github/workflows/dev_test_benckmark.yml @@ -12,12 +12,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 - os: - - ubuntu-20.04 - runs-on: ${{ matrix.os }} + python-version: ['3.9', '3.11'] + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/dev_test_bettertransformer.yml b/.github/workflows/dev_test_bettertransformer.yml index e4c999ca6da..e75b5e3bf98 100644 --- a/.github/workflows/dev_test_bettertransformer.yml +++ b/.github/workflows/dev_test_bettertransformer.yml @@ -12,18 +12,16 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 os: - ubuntu-20.04 - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} + - name: Setup Python uses: actions/setup-python@v2 with: - python-version: ${{ matrix.python-version }} + python-version: '3.9' - name: Install dependencies run: | pip install .[tests] diff --git a/.github/workflows/dev_test_dummy_inputs.yml b/.github/workflows/dev_test_dummy_inputs.yml index 49baa49c418..72a4763e432 100644 --- 
a/.github/workflows/dev_test_dummy_inputs.yml +++ b/.github/workflows/dev_test_dummy_inputs.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - macos-13 diff --git a/.github/workflows/dev_test_exporters.yml b/.github/workflows/dev_test_exporters.yml index 5d967d125f5..b2dee3ed3a9 100644 --- a/.github/workflows/dev_test_exporters.yml +++ b/.github/workflows/dev_test_exporters.yml @@ -12,12 +12,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 - os: - - ubuntu-20.04 - runs-on: ${{ matrix.os }} + python-version: ['3.9', '3.11'] + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/dev_test_fx.yml b/.github/workflows/dev_test_fx.yml index 0b8633282f7..a0c54c78365 100644 --- a/.github/workflows/dev_test_fx.yml +++ b/.github/workflows/dev_test_fx.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - macos-13 diff --git a/.github/workflows/dev_test_onnx.yml b/.github/workflows/dev_test_onnx.yml index 48052cfded3..f7514e1c5e5 100644 --- a/.github/workflows/dev_test_onnx.yml +++ b/.github/workflows/dev_test_onnx.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - macos-13 diff --git a/.github/workflows/dev_test_onnxruntime.yml b/.github/workflows/dev_test_onnxruntime.yml index 857028ab2db..c9104ebbd6c 100644 --- a/.github/workflows/dev_test_onnxruntime.yml +++ b/.github/workflows/dev_test_onnxruntime.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - windows-2019 diff --git a/.github/workflows/dev_test_optimum_common.yml b/.github/workflows/dev_test_optimum_common.yml index 807ed0b1dab..117db50437b 100644 --- a/.github/workflows/dev_test_optimum_common.yml +++ b/.github/workflows/dev_test_optimum_common.yml @@ -12,10 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.7 - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - windows-2019 diff --git a/.github/workflows/test_export_onnx.yml b/.github/workflows/test_export_onnx.yml index 0cd19a1724c..d1fd4a9723f 100644 --- a/.github/workflows/test_export_onnx.yml +++ b/.github/workflows/test_export_onnx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_export_tflite.yml b/.github/workflows/test_export_tflite.yml index 362390b166d..225a28c1cba 100644 --- a/.github/workflows/test_export_tflite.yml +++ b/.github/workflows/test_export_tflite.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9', '3.11'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli.yml b/.github/workflows/test_export_tflite_cli.yml index e14e4cde325..cfca58cf9c1 100644 --- a/.github/workflows/test_export_tflite_cli.yml +++ b/.github/workflows/test_export_tflite_cli.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + 
python-version: ['3.9', '3.11'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml b/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml index 7e4a83b3b7b..9cebe8ac0f6 100644 --- a/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml +++ b/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_fp16.yml b/.github/workflows/test_export_tflite_cli_quantization_fp16.yml index 981dd005e52..ca35ad8b3eb 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_fp16.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_fp16.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml b/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml index 9064bfaf315..1531ffa5c9c 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml b/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml index 824e8933a08..7274d09c0f8 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml b/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml index e975997e379..6c8639ebfe0 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml b/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml index ef59cff0b92..39902d0dd50 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml 
+++ b/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_exporters_common.yml b/.github/workflows/test_exporters_common.yml index 11f6038afe4..801e0bebc55 100644 --- a/.github/workflows/test_exporters_common.yml +++ b/.github/workflows/test_exporters_common.yml @@ -15,10 +15,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] - os: [ubuntu-20.04] + python-version: ['3.9', '3.11'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_exporters_slow.yml b/.github/workflows/test_exporters_slow.yml index 453389d63fa..b5f142fc7dc 100644 --- a/.github/workflows/test_exporters_slow.yml +++ b/.github/workflows/test_exporters_slow.yml @@ -14,10 +14,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml index a4e6dd3cd29..0a1890cc715 100644 --- a/.github/workflows/test_fx.yml +++ b/.github/workflows/test_fx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_offline.yml b/.github/workflows/test_offline.yml index 20911fe6db8..29b7b183bd7 100644 --- a/.github/workflows/test_offline.yml +++ b/.github/workflows/test_offline.yml @@ -15,10 +15,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index dd1f3bee63d..418a9e42c1a 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 0ab95752d01..089300f7cd9 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -39,7 +39,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: '3.9' - name: Install dependencies run: | diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml index c5679e5b307..89d44e57ad1 100644 --- a/.github/workflows/test_onnxruntime_slow.yml +++ b/.github/workflows/test_onnxruntime_slow.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_optimum_common.yml b/.github/workflows/test_optimum_common.yml index 5ad42807a5f..9aab45e4b71 100644 --- a/.github/workflows/test_optimum_common.yml +++ b/.github/workflows/test_optimum_common.yml @@ -17,7 +17,7 @@ jobs: 
strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9]
+        python-version: ['3.9']
         os: [ubuntu-20.04, windows-2019, macos-13]

     runs-on: ${{ matrix.os }}
@@ -36,5 +36,5 @@ jobs:
         shell: bash
         run: |
           # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel.
-          export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }}
+          export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.9' && matrix.os == 'ubuntu-20.04' }}
           pytest tests/test_*.py
diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml
index b5f2e27fc6a..0126b023c60 100644
--- a/.github/workflows/test_utils.yml
+++ b/.github/workflows/test_utils.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-20.04, macos-13]
-        python-version: [3.9]
+        python-version: ['3.9']

     runs-on: ${{ matrix.os }}
     steps:
diff --git a/setup.py b/setup.py
index 82892bfcc8c..7ea0da56c29 100644
--- a/setup.py
+++ b/setup.py
@@ -123,9 +123,10 @@
         "Intended Audience :: Education",
         "Intended Audience :: Science/Research",
         "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
     keywords="transformers, quantization, pruning, optimization, training, inference, onnx, onnx runtime, intel, "
@@ -137,7 +138,7 @@
     packages=find_namespace_packages(include=["optimum*"]),
     install_requires=REQUIRED_PKGS,
     extras_require=EXTRAS_REQUIRE,
-    python_requires=">=3.7.0",
+    python_requires=">=3.9.0",
     include_package_data=True,
     zip_safe=False,
     entry_points={"console_scripts": ["optimum-cli=optimum.commands.optimum_cli:main"]},

From e8b03321035ea19001bcbb773444e3f0574d4150 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Fri, 15 Nov 2024 17:15:33 +0100
Subject: [PATCH 35/50] Update Dockerfile base image (#2089)

upgrade base image
---
 docs/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Dockerfile b/docs/Dockerfile
index 29ea0f916ce..d76dc50c556 100644
--- a/docs/Dockerfile
+++ b/docs/Dockerfile
@@ -1,4 +1,4 @@
-FROM nikolaik/python-nodejs:python3.8-nodejs18
+FROM nikolaik/python-nodejs:python3.9-nodejs18

 ARG commit_sha
 ARG clone_url

From c513437511e51ccedb4f28c30e6aea9c0cf76a4a Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Mon, 18 Nov 2024 13:47:29 +0100
Subject: [PATCH 36/50] Add transformers 4.36 tests (#2085)

* add transformers 4.36 tests
* add test depending on transformers version
* add min transformers required version for gemma
* update macos
* fix whisper test
* add opt
* fix mpt
* add comment
* add granite test when supported by transformers
---
 .github/workflows/test_onnxruntime.yml  |  4 ++-
 optimum/exporters/onnx/model_configs.py |  4 ++-
 setup.py                                | 10 +++----
 tests/onnxruntime/test_modeling.py      | 37 +++++++++++++++----------
 4 files changed, 33 insertions(+), 22 deletions(-)
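Usage sketch of the version-gating pattern these tests adopt; the helper is the one imported in test_modeling.py, while the architecture lists here are illustrative:

    from optimum.utils.import_utils import check_if_transformers_greater

    SUPPORTED_ARCHITECTURES = ["gpt2", "llama", "opt"]  # baseline set, runs on transformers 4.36

    if check_if_transformers_greater("4.38"):
        SUPPORTED_ARCHITECTURES.append("gemma")

    if check_if_transformers_greater("4.45"):
        SUPPORTED_ARCHITECTURES.append("granite")

diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
index 089300f7cd9..fec5c7e5b27 100644
--- a/.github/workflows/test_onnxruntime.yml
+++ b/.github/workflows/test_onnxruntime.yml
@@ -18,8 +18,10 @@ jobs:
       fail-fast: false
       matrix:
         transformers-version: ["latest"]
-        os: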
[ubuntu-20.04, windows-2019, macos-15] include: + - transformers-version: "4.36.*" + os: ubuntu-20.04 - transformers-version: "4.45.*" os: ubuntu-20.04 diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index cc752779d30..6b92109b7b6 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -295,7 +295,7 @@ class Qwen2OnnxConfig(LlamaOnnxConfig): class GemmaOnnxConfig(LlamaOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator - pass + MIN_TRANSFORMERS_VERSION = version.parse("4.38.0") class GraniteOnnxConfig(LlamaOnnxConfig): @@ -348,6 +348,8 @@ def patch_model_for_export( class MPTOnnxConfig(TextDecoderOnnxConfig): # MPT does not require position_ids input. DEFAULT_ONNX_OPSET = 13 + # TODO: fix inference for transformers < v4.41 for beam_search > 1 + MIN_TRANSFORMERS_VERSION = version.parse("4.41.0") NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="n_heads", hidden_size="d_model", num_layers="n_layers" ) diff --git a/setup.py b/setup.py index 7ea0da56c29..29f97b604e0 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "onnxruntime-gpu": [ "onnx", @@ -63,19 +63,19 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "exporters": [ "onnx", "onnxruntime", "timm", - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "exporters-gpu": [ "onnx", "onnxruntime-gpu", "timm", - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", @@ -86,7 +86,7 @@ "h5py", "numpy<1.24.0", "datasets<=2.16", - "transformers>=4.26,<4.38", + "transformers>=4.36,<4.38", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index a335e014478..84ac27029f9 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2318,21 +2318,28 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "bloom", "codegen", "falcon", - "gemma", "gpt2", "gpt_bigcode", "gpt_neo", "gpt_neox", "gptj", - "granite", "llama", "mistral", - "mpt", "opt", ] - if check_if_transformers_greater("4.40"): - SUPPORTED_ARCHITECTURES.extend(["gemma", "phi3", "qwen2"]) + if check_if_transformers_greater("4.37"): + SUPPORTED_ARCHITECTURES.append("qwen2") + + if check_if_transformers_greater("4.38"): + SUPPORTED_ARCHITECTURES.append("gemma") + + # TODO: fix "mpt" for which inference fails for transformers < v4.41 + if check_if_transformers_greater("4.41"): + SUPPORTED_ARCHITECTURES.extend(["phi3", "mpt"]) + + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES.append("granite") FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, @@ -2445,7 +2452,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach transformers_model = AutoModelForCausalLM.from_pretrained(model_id) transformers_model = transformers_model.eval() tokenizer = get_preprocessor(model_id) - tokens = tokenizer("This is a sample output", return_tensors="pt") + tokens = tokenizer("This is a sample input", return_tensors="pt") position_ids = None if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: input_shape = 
tokens["input_ids"].shape @@ -2467,7 +2474,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach # Compare batched generation. tokenizer.pad_token_id = tokenizer.eos_token_id tokenizer.padding_side = "left" - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens = tokenizer(["This is", "This is a sample input"], return_tensors="pt", padding=True) onnx_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None onnx_model.config.eos_token_id = None @@ -4598,14 +4605,14 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str): ) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual( - outputs_model_with_pkv.shape[1], - self.GENERATION_LENGTH + 2 if model_arch == "whisper" else self.GENERATION_LENGTH + 1, - ) - self.assertEqual( - outputs_model_without_pkv.shape[1], - self.GENERATION_LENGTH + 2 if model_arch == "whisper" else self.GENERATION_LENGTH + 1, - ) + + if model_arch == "whisper" and check_if_transformers_greater("4.43"): + gen_length = self.GENERATION_LENGTH + 2 + else: + gen_length = self.GENERATION_LENGTH + 1 + + self.assertEqual(outputs_model_with_pkv.shape[1], gen_length) + self.assertEqual(outputs_model_without_pkv.shape[1], gen_length) self.GENERATION_LENGTH = generation_length if os.environ.get("TEST_LEVEL", 0) == "1": From 400bb82f312016b0a31b342d48b00d031786417d Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:05:37 +0100 Subject: [PATCH 37/50] [`fix`] Allow ORTQuantizer over models with subfolder ONNX files (#2094) * Allow ORTQuantizer over models with subfolder ONNX files * Also catch ValueError as that seems a common fail when AutoConfig.from_pretrained("does/not/exist") * Use test case that previously failed --- optimum/onnxruntime/quantization.py | 9 +++++---- tests/onnxruntime/test_quantization.py | 8 ++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py index 056123f8d8e..f637916dcd2 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -100,7 +100,7 @@ def __init__(self, onnx_model_path: Path, config: Optional["PretrainedConfig"] = if self.config is None: try: self.config = AutoConfig.from_pretrained(self.onnx_model_path.parent) - except OSError: + except (OSError, ValueError): LOGGER.warning( f"Could not load the config for {self.onnx_model_path} automatically, this might make " "the quantized model harder to use because it will not be able to be loaded by an ORTModel without " @@ -134,6 +134,7 @@ def from_pretrained( model_or_path = Path(model_or_path) path = None + config = None if isinstance(model_or_path, ORTModelForConditionalGeneration): raise NotImplementedError(ort_quantizer_error_message) elif isinstance(model_or_path, Path) and file_name is None: @@ -147,13 +148,13 @@ def from_pretrained( file_name = onnx_files[0].name if isinstance(model_or_path, ORTModel): - if path is None: - path = Path(model_or_path.model._model_path) + path = Path(model_or_path.model._model_path) + config = model_or_path.config elif os.path.isdir(model_or_path): path = Path(model_or_path) / file_name else: raise ValueError(f"Unable to load model from {model_or_path}.") - return cls(path) + return cls(path, config=config) def fit( self, diff --git a/tests/onnxruntime/test_quantization.py 
index b6f1ebb70f6..34a9504f95a 100644
--- a/tests/onnxruntime/test_quantization.py
+++ b/tests/onnxruntime/test_quantization.py
@@ -30,6 +30,7 @@
     AutoQuantizationConfig,
     ORTConfig,
     ORTModelForCausalLM,
+    ORTModelForFeatureExtraction,
     ORTModelForSeq2SeqLM,
     ORTModelForSequenceClassification,
     ORTQuantizer,
@@ -52,6 +53,13 @@ class ORTQuantizerTest(unittest.TestCase):
                 "optimum/distilbert-base-uncased-finetuned-sst-2-english"
             )
         },
+        "ort_model_with_onnx_model_in_subfolder": {
+            "model_or_path": ORTModelForFeatureExtraction.from_pretrained(
+                "sentence-transformers/all-MiniLM-L6-v2",
+                subfolder="onnx",
+                file_name="model.onnx",
+            )
+        },
     }

     @parameterized.expand(LOAD_CONFIGURATION.items())

From a7a807c9e712fd9669865358e34c1de072b78d8e Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Tue, 19 Nov 2024 13:10:57 +0100
Subject: [PATCH 38/50] SD3 and Flux support (#2073)

* sd3 support
* unsupported cli model types
* flux transformer support, unet export fixes, updated callback test, updated negative prompt test, flux and sd3 tests
* fixes
* move input generators
* dummy diffusers
* style
* distribute ort tests
* fix
* fix
* fix
* test num images
* single process to reduce re-exports
* test
* revert unnecessary changes
* T5Encoder inherits from TextEncoder
* style
* fix typo in timestep
* style
* only test sd3 and flux on latest transformers
* conditional sd3 and flux modeling
* forgot sd3 inpaint
---
 .github/workflows/test_onnxruntime.yml       |  13 +-
 optimum/exporters/onnx/base.py               |   1 +
 optimum/exporters/onnx/convert.py            |   4 +
 optimum/exporters/onnx/model_configs.py      | 123 +++++++++--
 optimum/exporters/tasks.py                   |  29 ++-
 optimum/exporters/utils.py                   | 190 +++++++++++------
 optimum/onnxruntime/__init__.py              |  72 +++++--
 optimum/onnxruntime/modeling_diffusion.py    | 202 +++++++++++++++++--
 optimum/utils/__init__.py                    |   7 +
 optimum/utils/constant.py                    |   4 +-
 optimum/utils/dummy_diffusers_objects.py     |  74 ++++++-
 optimum/utils/input_generators.py            |  81 +++++++-
 tests/exporters/exporters_utils.py           |   4 +-
 tests/exporters/onnx/test_onnx_export.py     |   2 -
 tests/onnxruntime/test_diffusion.py          | 192 +++++++++++-------
 tests/onnxruntime/test_modeling.py           |   2 +-
 tests/onnxruntime/test_quantization.py       |   4 +-
 tests/onnxruntime/utils_onnxruntime_tests.py |   4 +-
 18 files changed, 791 insertions(+), 217 deletions(-)
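Usage sketch of the task-level entrypoint with a Flux checkpoint; the model id and prompt are illustrative, and export=True converts the submodels (transformer, text encoders, VAE) to ONNX on the fly:

    from optimum.onnxruntime import ORTPipelineForText2Image

    # Dispatches to the matching ORT pipeline class (here ORTFluxPipeline)
    # based on the checkpoint's model_index.json.
    pipeline = ORTPipelineForText2Image.from_pretrained(
        "black-forest-labs/FLUX.1-schnell", export=True
    )
    image = pipeline(prompt="a photo of an astronaut riding a horse", num_inference_steps=4).images[0]

diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
index fec5c7e5b27..b20a3b46f88 100644
--- a/.github/workflows/test_onnxruntime.yml
+++ b/.github/workflows/test_onnxruntime.yml
@@ -26,14 +26,11 @@ jobs:
           os: ubuntu-20.04

     runs-on: ${{ matrix.os }}
+
     steps:
       - name: Free Disk Space (Ubuntu)
         if: matrix.os == 'ubuntu-20.04'
         uses: jlumbroso/free-disk-space@main
-        with:
-          tool-cache: false
-          swap-storage: false
-          large-packages: false

       - name: Checkout code
         uses: actions/checkout@v4
@@ -54,13 +51,11 @@ jobs:
         run: pip install transformers==${{ matrix.transformers-version }}

       - name: Test with pytest (in series)
-        working-directory: tests
         run: |
-          pytest onnxruntime -m "run_in_series" --durations=0 -vvvv -s
+          pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv -s

       - name: Test with pytest (in parallel)
+        run: |
+          pytest tests/onnxruntime -m "not run_in_series" --durations=0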
-vvvv -s -n auto env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - working-directory: tests - run: | - pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index 8cd94194ffe..7e35691d54b 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -319,6 +319,7 @@ def fix_dynamic_axes( input_shapes = {} dummy_inputs = self.generate_dummy_inputs(framework="np", **input_shapes) dummy_inputs = self.generate_dummy_inputs_for_validation(dummy_inputs, onnx_input_names=onnx_input_names) + dummy_inputs = self.rename_ambiguous_inputs(dummy_inputs) onnx_inputs = {} for name, value in dummy_inputs.items(): diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 2661d835979..c12a9ac222a 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -1183,6 +1183,10 @@ def onnx_export_from_model( if tokenizer_2 is not None: tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + tokenizer_3 = getattr(model, "tokenizer_3", None) + if tokenizer_3 is not None: + tokenizer_3.save_pretrained(output.joinpath("tokenizer_3")) + model.save_config(output) if float_dtype == "bf16": diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 6b92109b7b6..8984162ee8c 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Model specific ONNX configurations.""" + import random from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union @@ -28,6 +29,8 @@ DummyCodegenDecoderTextInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, + DummyFluxTransformerTextInputGenerator, + DummyFluxTransformerVisionInputGenerator, DummyInputGenerator, DummyIntGenerator, DummyPastKeyValuesGenerator, @@ -38,6 +41,9 @@ DummySpeechT5InputGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, + DummyTransformerTextInputGenerator, + DummyTransformerTimestepInputGenerator, + DummyTransformerVisionInputGenerator, DummyVisionEmbeddingsGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator, DummyVisionInputGenerator, @@ -53,6 +59,7 @@ NormalizedTextConfig, NormalizedTextConfigWithGQA, NormalizedVisionConfig, + check_if_diffusers_greater, check_if_transformers_greater, is_diffusers_available, logging, @@ -1039,22 +1046,13 @@ def outputs(self) -> Dict[str, Dict[int, str]]: "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, "pooler_output": {0: "batch_size"}, } + if self._normalized_config.output_hidden_states: for i in range(self._normalized_config.num_layers + 1): common_outputs[f"hidden_states.{i}"] = {0: "batch_size", 1: "sequence_length"} return common_outputs - def generate_dummy_inputs(self, framework: str = "pt", **kwargs): - dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) - - # TODO: fix should be by casting inputs during inference and not export - if framework == "pt": - import torch - - dummy_inputs["input_ids"] = dummy_inputs["input_ids"].to(dtype=torch.int32) - return dummy_inputs - def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], @@ -1064,7 +1062,7 @@ def patch_model_for_export( class UNetOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-3 + 
ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1087,17 +1085,19 @@ class UNetOnnxConfig(VisionOnnxConfig): def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = { "sample": {0: "batch_size", 2: "height", 3: "width"}, - "timestep": {0: "steps"}, + "timestep": {}, # a scalar with no dimension "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, } - # TODO : add text_image, image and image_embeds + # TODO : add addition_embed_type == text_image, image and image_embeds + # https://github.com/huggingface/diffusers/blob/9366c8f84bfe47099ff047272661786ebb54721d/src/diffusers/models/unets/unet_2d_condition.py#L671 if getattr(self._normalized_config, "addition_embed_type", None) == "text_time": common_inputs["text_embeds"] = {0: "batch_size"} common_inputs["time_ids"] = {0: "batch_size"} if getattr(self._normalized_config, "time_cond_proj_dim", None) is not None: common_inputs["timestep_cond"] = {0: "batch_size"} + return common_inputs @property @@ -1136,7 +1136,7 @@ def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]: class VaeEncoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-4 + ATOL_FOR_VALIDATION = 3e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1184,6 +1184,101 @@ def outputs(self) -> Dict[str, Dict[int, str]]: } +class T5EncoderOnnxConfig(TextEncoderOnnxConfig): + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 12 # int64 was supported since opset 12 + + @property + def inputs(self): + return { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + } + + @property + def outputs(self): + return { + "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, + } + + +class SD3TransformerOnnxConfig(VisionOnnxConfig): + ATOL_FOR_VALIDATION = 1e-4 + # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu + # operator support, available since opset 14 + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestepInputGenerator, + DummyTransformerVisionInputGenerator, + DummyTransformerTextInputGenerator, + ) + + NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( + image_size="sample_size", + num_channels="in_channels", + vocab_size="attention_head_dim", + hidden_size="joint_attention_dim", + projection_size="pooled_projection_dim", + allow_new=True, + ) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = { + "hidden_states": {0: "batch_size", 2: "height", 3: "width"}, + "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, + "pooled_projections": {0: "batch_size"}, + "timestep": {0: "step"}, + } + + return common_inputs + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "out_hidden_states": {0: "batch_size", 2: "height", 3: "width"}, + } + + @property + def torch_to_onnx_output_map(self) -> Dict[str, str]: + return { + "sample": "out_hidden_states", + } + + +class FluxTransformerOnnxConfig(SD3TransformerOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestepInputGenerator, + DummyFluxTransformerVisionInputGenerator, + DummyFluxTransformerTextInputGenerator, + ) + + @property + def inputs(self): + common_inputs = super().inputs + common_inputs["hidden_states"] = {0: 
"batch_size", 1: "packed_height_width"} + common_inputs["txt_ids"] = ( + {0: "sequence_length"} if check_if_diffusers_greater("0.31.0") else {0: "batch_size", 1: "sequence_length"} + ) + common_inputs["img_ids"] = ( + {0: "packed_height_width"} + if check_if_diffusers_greater("0.31.0") + else {0: "batch_size", 1: "packed_height_width"} + ) + + if getattr(self._normalized_config, "guidance_embeds", False): + common_inputs["guidance"] = {0: "batch_size"} + + return common_inputs + + @property + def outputs(self): + return { + "out_hidden_states": {0: "batch_size", 1: "packed_height_width"}, + } + + class GroupViTOnnxConfig(CLIPOnnxConfig): pass diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index fdc8bfcb539..b4bce4696f3 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -335,7 +335,11 @@ class TasksManager: } _DIFFUSERS_SUPPORTED_MODEL_TYPE = { - "clip-text-model": supported_tasks_mapping( + "t5-encoder": supported_tasks_mapping( + "feature-extraction", + onnx="T5EncoderOnnxConfig", + ), + "clip-text": supported_tasks_mapping( "feature-extraction", onnx="CLIPTextOnnxConfig", ), @@ -343,7 +347,15 @@ class TasksManager: "feature-extraction", onnx="CLIPTextWithProjectionOnnxConfig", ), - "unet": supported_tasks_mapping( + "flux-transformer-2d": supported_tasks_mapping( + "semantic-segmentation", + onnx="FluxTransformerOnnxConfig", + ), + "sd3-transformer-2d": supported_tasks_mapping( + "semantic-segmentation", + onnx="SD3TransformerOnnxConfig", + ), + "unet-2d-condition": supported_tasks_mapping( "semantic-segmentation", onnx="UNetOnnxConfig", ), @@ -1177,12 +1189,17 @@ class TasksManager: "transformers": _SUPPORTED_MODEL_TYPE, } _UNSUPPORTED_CLI_MODEL_TYPE = { - "unet", + # diffusers model types + "clip-text", + "clip-text-with-projection", + "flux-transformer-2d", + "sd3-transformer-2d", + "t5-encoder", + "unet-2d-condition", "vae-encoder", "vae-decoder", - "clip-text-model", - "clip-text-with-projection", - "trocr", # supported through the vision-encoder-decoder model type + # redundant model types + "trocr", # same as vision-encoder-decoder } _SUPPORTED_CLI_MODEL_TYPE = ( set(_SUPPORTED_MODEL_TYPE.keys()) diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index 949b54f4685..60de169de5e 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -15,7 +15,6 @@ """Utilities for model preparation to export.""" - import copy from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union @@ -44,12 +43,7 @@ "Please update diffusers by running `pip install --upgrade diffusers`" ) - from diffusers import ( - DiffusionPipeline, - StableDiffusionXLImg2ImgPipeline, - StableDiffusionXLInpaintPipeline, - StableDiffusionXLPipeline, - ) + from diffusers import DiffusionPipeline from diffusers.models.attention_processor import ( Attention, AttnAddedKVProcessor, @@ -80,6 +74,20 @@ DECODER_MERGED_NAME = "decoder_model_merged" +_DIFFUSERS_CLASS_NAME_TO_SUBMODEL_TYPE = { + "CLIPTextModel": "clip-text", + "CLIPTextModelWithProjection": "clip-text-with-projection", + "FluxTransformer2DModel": "flux-transformer-2d", + "SD3Transformer2DModel": "sd3-transformer-2d", + "UNet2DConditionModel": "unet-2d-condition", + "T5EncoderModel": "t5-encoder", +} + + +def _get_diffusers_submodel_type(submodel): + return _DIFFUSERS_CLASS_NAME_TO_SUBMODEL_TYPE.get(submodel.__class__.__name__) + + def _get_submodels_for_export_diffusion( pipeline: "DiffusionPipeline", ) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]: @@ 
-87,56 +95,87 @@ def _get_submodels_for_export_diffusion( Returns the components of a Stable Diffusion model. """ - is_stable_diffusion_xl = isinstance( - pipeline, (StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline) - ) - if is_stable_diffusion_xl: - projection_dim = pipeline.text_encoder_2.config.projection_dim - else: - projection_dim = pipeline.text_encoder.config.projection_dim - models_for_export = {} + is_torch_greater_or_equal_than_2_1 = version.parse(torch.__version__) >= version.parse("2.1.0") + is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL") + is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3") + # Text encoder text_encoder = getattr(pipeline, "text_encoder", None) if text_encoder is not None: - if is_stable_diffusion_xl: + if is_sdxl or is_sd3: text_encoder.config.output_hidden_states = True + text_encoder.text_model.config.output_hidden_states = True + + text_encoder.config.export_model_type = _get_diffusers_submodel_type(text_encoder) models_for_export["text_encoder"] = text_encoder - # U-NET - # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 - is_torch_greater_or_equal_than_2_1 = version.parse(torch.__version__) >= version.parse("2.1.0") - if not is_torch_greater_or_equal_than_2_1: - pipeline.unet.set_attn_processor(AttnProcessor()) + # Text encoder 2 + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + if is_sdxl or is_sd3: + text_encoder_2.config.output_hidden_states = True + text_encoder_2.text_model.config.output_hidden_states = True - pipeline.unet.config.text_encoder_projection_dim = projection_dim - # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` - # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 - pipeline.unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) - models_for_export["unet"] = pipeline.unet + text_encoder_2.config.export_model_type = _get_diffusers_submodel_type(text_encoder_2) + models_for_export["text_encoder_2"] = text_encoder_2 - # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + # Text encoder 3 + text_encoder_3 = getattr(pipeline, "text_encoder_3", None) + if text_encoder_3 is not None: + text_encoder_3.config.export_model_type = _get_diffusers_submodel_type(text_encoder_3) + models_for_export["text_encoder_3"] = text_encoder_3 + + # U-NET + unet = getattr(pipeline, "unet", None) + if unet is not None: + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 + if not is_torch_greater_or_equal_than_2_1: + unet.set_attn_processor(AttnProcessor()) + + # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` + # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 + unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + unet.config.time_cond_proj_dim = getattr(pipeline.unet.config, "time_cond_proj_dim", None) + unet.config.text_encoder_projection_dim = pipeline.text_encoder.config.projection_dim + unet.config.export_model_type = _get_diffusers_submodel_type(unet) + models_for_export["unet"] = unet + + # Transformer + transformer = getattr(pipeline, 
"transformer", None) + if transformer is not None: + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 + if not is_torch_greater_or_equal_than_2_1: + transformer.set_attn_processor(AttnProcessor()) + + transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + transformer.config.time_cond_proj_dim = getattr(pipeline.transformer.config, "time_cond_proj_dim", None) + transformer.config.text_encoder_projection_dim = pipeline.text_encoder.config.projection_dim + transformer.config.export_model_type = _get_diffusers_submodel_type(transformer) + models_for_export["transformer"] = transformer + + # VAE Encoder vae_encoder = copy.deepcopy(pipeline.vae) + + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 if not is_torch_greater_or_equal_than_2_1: vae_encoder = override_diffusers_2_0_attn_processors(vae_encoder) + # we return the distribution parameters to be able to recreate it in the decoder vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} models_for_export["vae_encoder"] = vae_encoder - # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + # VAE Decoder vae_decoder = copy.deepcopy(pipeline.vae) + + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 if not is_torch_greater_or_equal_than_2_1: vae_decoder = override_diffusers_2_0_attn_processors(vae_decoder) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) models_for_export["vae_decoder"] = vae_decoder - text_encoder_2 = getattr(pipeline, "text_encoder_2", None) - if text_encoder_2 is not None: - text_encoder_2.config.output_hidden_states = True - text_encoder_2.text_model.config.output_hidden_states = True - models_for_export["text_encoder_2"] = text_encoder_2 - return models_for_export @@ -294,33 +333,59 @@ def get_diffusion_models_for_export( `Dict[str, Tuple[Union[`PreTrainedModel`, `TFPreTrainedModel`], `ExportConfig`]: A Dict containing the model and export configs for the different components of the model. 
""" + models_for_export = _get_submodels_for_export_diffusion(pipeline) # Text encoder if "text_encoder" in models_for_export: + text_encoder = models_for_export["text_encoder"] text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.text_encoder, - exporter=exporter, - library_name="diffusers", - task="feature-extraction", + model=text_encoder, exporter=exporter, library_name="diffusers", task="feature-extraction" ) text_encoder_export_config = text_encoder_config_constructor( - pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype ) models_for_export["text_encoder"] = (models_for_export["text_encoder"], text_encoder_export_config) + # Text encoder 2 + if "text_encoder_2" in models_for_export: + text_encoder_2 = models_for_export["text_encoder_2"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_2, exporter=exporter, library_name="diffusers", task="feature-extraction" + ) + export_config = export_config_constructor(text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], export_config) + + # Text encoder 3 + if "text_encoder_3" in models_for_export: + text_encoder_3 = models_for_export["text_encoder_3"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_3, exporter=exporter, library_name="diffusers", task="feature-extraction" + ) + export_config = export_config_constructor(text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["text_encoder_3"] = (models_for_export["text_encoder_3"], export_config) + # U-NET - export_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.unet, - exporter=exporter, - library_name="diffusers", - task="semantic-segmentation", - model_type="unet", - ) - unet_export_config = export_config_constructor(pipeline.unet.config, int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["unet"] = (models_for_export["unet"], unet_export_config) + if "unet" in models_for_export: + unet = models_for_export["unet"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=unet, exporter=exporter, library_name="diffusers", task="semantic-segmentation" + ) + unet_export_config = export_config_constructor(unet.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["unet"] = (models_for_export["unet"], unet_export_config) - # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + # Transformer + if "transformer" in models_for_export: + transformer = models_for_export["transformer"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, exporter=exporter, library_name="diffusers", task="semantic-segmentation" + ) + transformer_export_config = export_config_constructor( + transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (models_for_export["transformer"], transformer_export_config) + + # VAE Encoder vae_encoder = models_for_export["vae_encoder"] vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_encoder, @@ -329,10 +394,12 @@ def get_diffusion_models_for_export( task="semantic-segmentation", model_type="vae-encoder", ) - vae_export_config = vae_config_constructor(vae_encoder.config, 
int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["vae_encoder"] = (vae_encoder, vae_export_config) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) - # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + # VAE Decoder vae_decoder = models_for_export["vae_decoder"] vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_decoder, @@ -341,21 +408,10 @@ def get_diffusion_models_for_export( task="semantic-segmentation", model_type="vae-decoder", ) - vae_export_config = vae_config_constructor(vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["vae_decoder"] = (vae_decoder, vae_export_config) - - if "text_encoder_2" in models_for_export: - export_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.text_encoder_2, - exporter=exporter, - library_name="diffusers", - task="feature-extraction", - model_type="clip-text-with-projection", - ) - export_config = export_config_constructor( - pipeline.text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype - ) - models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], export_config) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) return models_for_export diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 4e25a436909..f3f1535fd45 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -74,33 +74,51 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: _import_structure[".utils.dummy_diffusers_objects"] = [ - "ORTStableDiffusionPipeline", + "ORTDiffusionPipeline", + "ORTPipelineForText2Image", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + # flux + "ORTFluxPipeline", + # lcm + "ORTLatentConsistencyModelImg2ImgPipeline", + "ORTLatentConsistencyModelPipeline", + # sd3 + "ORTStableDiffusion3Img2ImgPipeline", + "ORTStableDiffusion3InpaintPipeline", + "ORTStableDiffusion3Pipeline", + # sd "ORTStableDiffusionImg2ImgPipeline", "ORTStableDiffusionInpaintPipeline", - "ORTStableDiffusionXLPipeline", + "ORTStableDiffusionPipeline", + # xl "ORTStableDiffusionXLImg2ImgPipeline", "ORTStableDiffusionXLInpaintPipeline", - "ORTLatentConsistencyModelPipeline", - "ORTLatentConsistencyModelImg2ImgPipeline", - "ORTPipelineForImage2Image", - "ORTPipelineForInpainting", - "ORTPipelineForText2Image", - "ORTDiffusionPipeline", + "ORTStableDiffusionXLPipeline", ] else: _import_structure["modeling_diffusion"] = [ - "ORTStableDiffusionPipeline", + "ORTDiffusionPipeline", + "ORTPipelineForText2Image", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + # flux + "ORTFluxPipeline", + # lcm + "ORTLatentConsistencyModelImg2ImgPipeline", + "ORTLatentConsistencyModelPipeline", + # sd3 + "ORTStableDiffusion3Img2ImgPipeline", + "ORTStableDiffusion3InpaintPipeline", + "ORTStableDiffusion3Pipeline", + # sd "ORTStableDiffusionImg2ImgPipeline", "ORTStableDiffusionInpaintPipeline", - "ORTStableDiffusionXLPipeline", + "ORTStableDiffusionPipeline", + # xl "ORTStableDiffusionXLImg2ImgPipeline", "ORTStableDiffusionXLInpaintPipeline", - "ORTLatentConsistencyModelImg2ImgPipeline", - 
"ORTLatentConsistencyModelPipeline", - "ORTPipelineForImage2Image", - "ORTPipelineForInpainting", - "ORTPipelineForText2Image", - "ORTDiffusionPipeline", + "ORTStableDiffusionXLPipeline", ] @@ -151,30 +169,52 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( + # generic entrypoint ORTDiffusionPipeline, + # flux + ORTFluxPipeline, + # lcm ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, + # task-specific entrypoints ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, + # sd3 + ORTStableDiffusion3Img2ImgPipeline, + ORTStableDiffusion3InpaintPipeline, + ORTStableDiffusion3Pipeline, + # sd ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, + # xl ORTStableDiffusionXLImg2ImgPipeline, ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: from .modeling_diffusion import ( + # generic entrypoint ORTDiffusionPipeline, + # flux + ORTFluxPipeline, + # lcm ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, + # task-specific entrypoints ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, + # sd3 + ORTStableDiffusion3Img2ImgPipeline, + ORTStableDiffusion3InpaintPipeline, + ORTStableDiffusion3Pipeline, + # sd ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, + # xl ORTStableDiffusionXLImg2ImgPipeline, ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 3899a7b36b6..79d302be449 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -57,7 +57,9 @@ from ..onnx.utils import _get_model_external_data_paths from ..utils import ( DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, + DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER, DIFFUSION_MODEL_UNET_SUBFOLDER, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, @@ -76,7 +78,7 @@ if check_if_diffusers_greater("0.25.0"): from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution else: - from diffusers.models.vae import DiagonalGaussianDistribution + from diffusers.models.vae import DiagonalGaussianDistribution # type: ignore logger = logging.getLogger(__name__) @@ -92,15 +94,18 @@ class ORTDiffusionPipeline(ORTModel, DiffusionPipeline): def __init__( self, scheduler: "SchedulerMixin", - unet_session: ort.InferenceSession, vae_decoder_session: ort.InferenceSession, # optional pipeline models + unet_session: Optional[ort.InferenceSession] = None, + transformer_session: Optional[ort.InferenceSession] = None, vae_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, + text_encoder_3_session: Optional[ort.InferenceSession] = None, # optional pipeline submodels tokenizer: Optional["CLIPTokenizer"] = None, tokenizer_2: Optional["CLIPTokenizer"] = None, + tokenizer_3: Optional["CLIPTokenizer"] = None, feature_extractor: Optional["CLIPFeatureExtractor"] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, @@ -111,16 +116,20 @@ def __init__( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - self.unet = 
ORTModelUnet(unet_session, self) - self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) - self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None + self.unet = ORTModelUnet(unet_session, self) if unet_session is not None else None + self.transformer = ORTModelTransformer(transformer_session, self) if transformer_session is not None else None self.text_encoder = ( ORTModelTextEncoder(text_encoder_session, self) if text_encoder_session is not None else None ) self.text_encoder_2 = ( ORTModelTextEncoder(text_encoder_2_session, self) if text_encoder_2_session is not None else None ) + self.text_encoder_3 = ( + ORTModelTextEncoder(text_encoder_3_session, self) if text_encoder_3_session is not None else None + ) # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API + self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None + self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) if vae_decoder_session is not None else None self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) # we allow passing these as torch models for now @@ -130,18 +139,22 @@ def __init__( self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 + self.tokenizer_3 = tokenizer_3 self.feature_extractor = feature_extractor all_pipeline_init_args = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": self.text_encoder_3, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, "scheduler": self.scheduler, "tokenizer": self.tokenizer, "tokenizer_2": self.tokenizer_2, + "tokenizer_3": self.tokenizer_3, "feature_extractor": self.feature_extractor, "requires_aesthetics_score": requires_aesthetics_score, "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, @@ -157,7 +170,10 @@ def __init__( # inits ort specific attributes self.shared_attributes_init( - model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs + model=unet_session if unet_session is not None else transformer_session, + use_io_binding=use_io_binding, + model_save_dir=model_save_dir, + **kwargs, ) def _save_pretrained(self, save_directory: Union[str, Path]): @@ -165,10 +181,12 @@ def _save_pretrained(self, save_directory: Union[str, Path]): models_to_save_paths = { (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER), + (self.transformer, save_directory / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER), (self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER), (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), + (self.text_encoder_3, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER), } for model, save_path in models_to_save_paths: if model is not None: @@ -192,6 +210,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.tokenizer_3 is not None: + self.tokenizer_3.save_pretrained(save_directory / "tokenizer_3") if self.feature_extractor is not None: self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @@ 
-208,10 +228,12 @@ def _from_pretrained( cache_dir: str = HUGGINGFACE_HUB_CACHE, token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, + transformer_file_name: str = ONNX_WEIGHTS_NAME, vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, + text_encoder_3_file_name: str = ONNX_WEIGHTS_NAME, use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", provider_options: Optional[Dict[str, Any]] = None, @@ -230,10 +252,12 @@ def _from_pretrained( allow_patterns.update( { unet_file_name, + transformer_file_name, vae_decoder_file_name, vae_encoder_file_name, text_encoder_file_name, text_encoder_2_file_name, + text_encoder_3_file_name, SCHEDULER_CONFIG_NAME, cls.config_name, CONFIG_NAME, @@ -259,10 +283,12 @@ def _from_pretrained( model_paths = { "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "transformer": model_save_path / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER / transformer_file_name, "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + "text_encoder_3": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER / text_encoder_3_file_name, } sessions = {} @@ -276,7 +302,7 @@ def _from_pretrained( ) submodels = {} - for submodel in {"scheduler", "tokenizer", "tokenizer_2", "feature_extractor"}: + for submodel in {"scheduler", "tokenizer", "tokenizer_2", "tokenizer_3", "feature_extractor"}: if kwargs.get(submodel, None) is not None: submodels[submodel] = kwargs.pop(submodel) elif config.get(submodel, (None, None))[0] is not None: @@ -385,17 +411,24 @@ def to(self, device: Union[torch.device, str, int]): if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": return self - self.unet.session.set_providers([provider], provider_options=[provider_options]) self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) + if self.unet is not None: + self.unet.session.set_providers([provider], provider_options=[provider_options]) + if self.transformer is not None: + self.transformer.session.set_providers([provider], provider_options=[provider_options]) if self.vae_encoder is not None: self.vae_encoder.session.set_providers([provider], provider_options=[provider_options]) if self.text_encoder is not None: self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) if self.text_encoder_2 is not None: self.text_encoder_2.session.set_providers([provider], provider_options=[provider_options]) + if self.text_encoder_3 is not None: + self.text_encoder_3.session.set_providers([provider], provider_options=[provider_options]) - self.providers = self.unet.session.get_providers() + self.providers = ( + self.unet.session.get_providers() if self.unet is not None else self.transformer.session.get_providers() + ) self._device = device return self @@ -412,8 +445,10 @@ def components(self) -> Dict[str, Any]: components = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": 
self.text_encoder_3, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, } @@ -443,9 +478,13 @@ def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTDiffusionP self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in self.session.get_inputs()} self.output_dtypes = {output_key.name: output_key.type for output_key in self.session.get_outputs()} + self.input_shapes = {input_key.name: input_key.shape for input_key in self.session.get_inputs()} + self.output_shapes = {output_key.name: output_key.shape for output_key in self.session.get_outputs()} + config_file_path = Path(session._model_path).parent / self.config_name if not config_file_path.is_file(): # config is mandatory for the model part to be used for inference @@ -543,13 +582,18 @@ def __init__(self, *args, **kwargs): ) self.register_to_config(time_cond_proj_dim=None) + if len(self.input_shapes["timestep"]) > 0: + logger.warning( + "The exported unet onnx model expects a non scalar timestep input. " + "We will have to unsqueeze the timestep input at each iteration which might be inefficient. " + "Please re-export the pipeline with newer version of optimum and diffusers to avoid this warning." + ) + def forward( self, sample: Union[np.ndarray, torch.Tensor], timestep: Union[np.ndarray, torch.Tensor], encoder_hidden_states: Union[np.ndarray, torch.Tensor], - text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, - time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, added_cond_kwargs: Optional[Dict[str, Any]] = None, @@ -557,15 +601,13 @@ def forward( ): use_torch = isinstance(sample, torch.Tensor) - if len(timestep.shape) == 0: + if len(self.input_shapes["timestep"]) > 0: timestep = timestep.unsqueeze(0) model_inputs = { "sample": sample, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, - "text_embeds": text_embeds, - "time_ids": time_ids, "timestep_cond": timestep_cond, **(cross_attention_kwargs or {}), **(added_cond_kwargs or {}), @@ -581,6 +623,42 @@ def forward( return ModelOutput(**model_outputs) +class ORTModelTransformer(ORTPipelinePart): + def forward( + self, + hidden_states: Union[np.ndarray, torch.Tensor], + encoder_hidden_states: Union[np.ndarray, torch.Tensor], + pooled_projections: Union[np.ndarray, torch.Tensor], + timestep: Union[np.ndarray, torch.Tensor], + guidance: Optional[Union[np.ndarray, torch.Tensor]] = None, + txt_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + img_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = False, + ): + use_torch = isinstance(hidden_states, torch.Tensor) + + model_inputs = { + "hidden_states": hidden_states, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "timestep": timestep, + "guidance": guidance, + "txt_ids": txt_ids, + "img_ids": img_ids, + **(joint_attention_kwargs or {}), + } + + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if return_dict: + return model_outputs + + return 
ModelOutput(**model_outputs) + + class ORTModelTextEncoder(ORTPipelinePart): def forward( self, @@ -599,11 +677,13 @@ def forward( if output_hidden_states: model_outputs["hidden_states"] = [] - for i in range(self.config.num_hidden_layers): + num_layers = self.num_hidden_layers if hasattr(self, "num_hidden_layers") else self.num_decoder_layers + for i in range(num_layers): model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) else: - for i in range(self.config.num_hidden_layers): + num_layers = self.num_hidden_layers if hasattr(self, "num_hidden_layers") else self.num_decoder_layers + for i in range(num_layers): model_outputs.pop(f"hidden_states.{i}", None) if return_dict: @@ -620,7 +700,7 @@ def __init__(self, *args, **kwargs): if not hasattr(self.config, "scaling_factor"): logger.warning( "The `scaling_factor` attribute is missing from the VAE encoder configuration. " - "Please re-export the model with newer version of optimum and diffusers." + "Please re-export the model with newer version of optimum and diffusers to avoid this warning." ) self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) @@ -660,7 +740,7 @@ def __init__(self, *args, **kwargs): if not hasattr(self.config, "scaling_factor"): logger.warning( "The `scaling_factor` attribute is missing from the VAE decoder configuration. " - "Please re-export the model with newer version of optimum and diffusers." + "Please re-export the model with newer version of optimum and diffusers to avoid this warning." ) self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) @@ -871,6 +951,80 @@ class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsi auto_model_class = LatentConsistencyModelImg2ImgPipeline +class ORTUnavailablePipeline: + MIN_VERSION = None + + def __init__(self, *args, **kwargs): + raise NotImplementedError( + f"The pipeline {self.__class__.__name__} is not available in the current version of `diffusers`. " + f"Please upgrade `diffusers` to {self.MIN_VERSION} or later." + ) + + +if check_if_diffusers_greater("0.29.0"): + from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTStableDiffusion3Pipeline(ORTDiffusionPipeline, StableDiffusion3Pipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusion3Pipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusion3Pipeline). + """ + + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusion3Pipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTStableDiffusion3Img2ImgPipeline(ORTDiffusionPipeline, StableDiffusion3Img2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusion3Img2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusion3Img2ImgPipeline). 
+ """ + + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = StableDiffusion3Img2ImgPipeline + +else: + + class ORTStableDiffusion3Pipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.29.0" + + class ORTStableDiffusion3Img2ImgPipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.29.0" + + +if check_if_diffusers_greater("0.30.0"): + from diffusers import FluxPipeline, StableDiffusion3InpaintPipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTStableDiffusion3InpaintPipeline(ORTDiffusionPipeline, StableDiffusion3InpaintPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusion3InpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusion3InpaintPipeline). + """ + + main_input_name = "prompt" + export_feature = "inpainting" + auto_model_class = StableDiffusion3InpaintPipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTFluxPipeline(ORTDiffusionPipeline, FluxPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.FluxPipeline](https://huggingface.co/docs/diffusers/api/pipelines/flux/text2img#diffusers.FluxPipeline). + """ + + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = FluxPipeline + +else: + + class ORTStableDiffusion3InpaintPipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.30.0" + + class ORTFluxPipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.30.0" + + SUPPORTED_ORT_PIPELINES = [ ORTStableDiffusionPipeline, ORTStableDiffusionImg2ImgPipeline, @@ -880,6 +1034,10 @@ class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsi ORTStableDiffusionXLInpaintPipeline, ORTLatentConsistencyModelPipeline, ORTLatentConsistencyModelImg2ImgPipeline, + ORTStableDiffusion3Pipeline, + ORTStableDiffusion3Img2ImgPipeline, + ORTStableDiffusion3InpaintPipeline, + ORTFluxPipeline, ] @@ -897,23 +1055,27 @@ def _get_ort_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tr ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ + ("flux", ORTFluxPipeline), + ("latent-consistency", ORTLatentConsistencyModelPipeline), ("stable-diffusion", ORTStableDiffusionPipeline), + ("stable-diffusion-3", ORTStableDiffusion3Pipeline), ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), - ("latent-consistency", ORTLatentConsistencyModelPipeline), ] ) ORT_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( [ + ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), + ("stable-diffusion-3", ORTStableDiffusion3Img2ImgPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), - ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ] ) ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ("stable-diffusion-3", ORTStableDiffusion3InpaintPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLInpaintPipeline), ] ) diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index db7d1f6975d..40d93d298e4 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -16,7 +16,9 @@ from .constant import ( CONFIG_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, + DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER, DIFFUSION_MODEL_UNET_SUBFOLDER, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, @@ -52,6 +54,8 
@@ DummyCodegenDecoderTextInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, + DummyFluxTransformerTextInputGenerator, + DummyFluxTransformerVisionInputGenerator, DummyInputGenerator, DummyIntGenerator, DummyLabelsGenerator, @@ -63,6 +67,9 @@ DummySpeechT5InputGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, + DummyTransformerTextInputGenerator, + DummyTransformerTimestepInputGenerator, + DummyTransformerVisionInputGenerator, DummyVisionEmbeddingsGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator, DummyVisionInputGenerator, diff --git a/optimum/utils/constant.py b/optimum/utils/constant.py index 4497b5246d4..eb7a67e9ece 100644 --- a/optimum/utils/constant.py +++ b/optimum/utils/constant.py @@ -15,8 +15,10 @@ CONFIG_NAME = "config.json" DIFFUSION_MODEL_UNET_SUBFOLDER = "unet" -DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER = "text_encoder" +DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER = "vae_decoder" DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER = "vae_encoder" +DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER = "text_encoder" DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER = "text_encoder_2" +DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" ONNX_WEIGHTS_NAME = "model.onnx" diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index 35d1ffe9fc7..ff8b587e19f 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -15,6 +15,50 @@ from .import_utils import DummyObject, requires_backends +class ORTDiffusionPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForText2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForImage2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForInpainting(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + class ORTStableDiffusionPipeline(metaclass=DummyObject): _backends = ["diffusers"] @@ -70,6 +114,17 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) +class ORTStableDiffusionXLInpaintPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + class ORTLatentConsistencyModelPipeline(metaclass=DummyObject): _backends = ["diffusers"] @@ -81,7 +136,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTDiffusionPipeline(metaclass=DummyObject): +class ORTLatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): @@ -92,7 +147,7 @@ def from_pretrained(cls, *args, 
**kwargs): requires_backends(cls, ["diffusers"]) -class ORTPipelineForText2Image(metaclass=DummyObject): +class ORTStableDiffusion3Pipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): @@ -103,7 +158,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTPipelineForImage2Image(metaclass=DummyObject): +class ORTStableDiffusion3Img2ImgPipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): @@ -114,7 +169,18 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTPipelineForInpainting(metaclass=DummyObject): +class ORTStableDiffusion3InpaintPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTFluxPipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index dac14a38114..148072aa0b4 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -22,7 +22,7 @@ import numpy as np from transformers.utils import is_tf_available, is_torch_available -from ..utils import check_if_transformers_greater +from ..utils import check_if_diffusers_greater, check_if_transformers_greater from .normalized_config import ( NormalizedConfig, NormalizedEncoderDecoderConfig, @@ -36,7 +36,7 @@ import torch if is_tf_available(): - import tensorflow as tf + import tensorflow as tf # type: ignore def check_framework_is_available(func): @@ -871,8 +871,8 @@ def __init__( def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "timestep": - shape = [self.batch_size] - return self.random_int_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=int_dtype) + shape = [] # a scalar with no dimension (it can be int or float depending on the sd architecture) + return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) if input_name == "text_embeds": dim = self.text_encoder_projection_dim @@ -1411,3 +1411,76 @@ def generate( float_dtype: str = "fp32", ): return self.random_int_tensor(shape=(1,), min_value=20, max_value=22, framework=framework, dtype=int_dtype) + + +class DummyTransformerTimestepInputGenerator(DummyTimestepInputGenerator): + SUPPORTED_INPUT_NAMES = ("timestep",) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "timestep": + shape = [self.batch_size] # With transformer diffusers, timestep is a 1D tensor + return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyTransformerVisionInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("hidden_states",) + + +class DummyTransformerTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "encoder_hidden_states", + "pooled_projection", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "encoder_hidden_states": + return super().generate(input_name, framework, int_dtype, 
float_dtype)[0] + + elif input_name == "pooled_projections": + return self.random_float_tensor( + [self.batch_size, self.normalized_config.projection_size], framework=framework, dtype=float_dtype + ) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyFluxTransformerVisionInputGenerator(DummyTransformerVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "hidden_states", + "img_ids", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "hidden_states": + shape = [self.batch_size, (self.height // 2) * (self.width // 2), self.num_channels] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + elif input_name == "img_ids": + shape = ( + [(self.height // 2) * (self.width // 2), 3] + if check_if_diffusers_greater("0.31.0") + else [self.batch_size, (self.height // 2) * (self.width // 2), 3] + ) + return self.random_int_tensor(shape, max_value=1, framework=framework, dtype=int_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyFluxTransformerTextInputGenerator(DummyTransformerTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "encoder_hidden_states", + "pooled_projections", + "txt_ids", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "txt_ids": + shape = ( + [self.sequence_length, 3] + if check_if_diffusers_greater("0.31.0") + else [self.batch_size, self.sequence_length, 3] + ) + return self.random_int_tensor(shape, max_value=1, framework=framework, dtype=int_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index ccccb5510bf..31059c403de 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -297,9 +297,11 @@ } PYTORCH_DIFFUSION_MODEL = { + "flux": "optimum-internal-testing/tiny-random-flux", + "latent-consistency": "echarlaix/tiny-random-latent-consistency", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", + "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", - "latent-consistency": "echarlaix/tiny-random-latent-consistency", } PYTORCH_TIMM_MODEL = { diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index 7671d6cd2e6..88288547c95 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -299,7 +299,6 @@ def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device with TemporaryDirectory() as tmpdirname: _, onnx_outputs = export_models( models_and_onnx_configs=models_and_onnx_configs, - opset=14, output_dir=Path(tmpdirname), device=device, ) @@ -307,7 +306,6 @@ def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device models_and_onnx_configs=models_and_onnx_configs, onnx_named_outputs=onnx_outputs, output_dir=Path(tmpdirname), - atol=1e-4, use_subprocess=False, ) diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 956566f0e1f..07f90e8984e 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -34,6 +34,7 @@ ORTPipelineForInpainting, ORTPipelineForText2Image, ) +from optimum.utils import check_if_transformers_greater from 
optimum.utils.testing_utils import grid_parameters, require_diffusers @@ -71,7 +72,29 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= class ORTPipelineForText2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES += ["stable-diffusion-3", "flux"] + + NEGATIVE_PROMPT_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + NEGATIVE_PROMPT_SUPPORTED_ARCHITECTURES += ["stable-diffusion-3"] + + CALLBACK_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + CALLBACK_SUPPORTED_ARCHITECTURES += ["flux"] ORTMODEL_CLASS = ORTPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -120,8 +143,8 @@ def test_num_images_per_prompt(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for batch_size in [1, 3]: - for height in [64, 128]: - for width in [64, 128]: + for height in [16, 32]: + for width in [16, 32]: for num_images_per_prompt in [1, 3]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images @@ -142,12 +165,12 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -164,6 +187,7 @@ def __init__(self): def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 + return kwargs ort_callback = Callback() auto_callback = Callback() @@ -171,9 +195,8 @@ def __call__(self, *args, **kwargs) -> None: ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - # callback_steps=1 to trigger callback every step - ort_pipe(**inputs, callback=ort_callback, callback_steps=1) - auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + ort_pipe(**inputs, callback_on_step_end=ort_callback) + auto_pipe(**inputs, callback_on_step_end=auto_callback) self.assertTrue(ort_callback.has_been_called) self.assertTrue(auto_callback.has_been_called) @@ -200,10 +223,20 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - self.assertEqual( - outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), - ) 
+ expected_height = height // pipeline.vae_scale_factor + expected_width = width // pipeline.vae_scale_factor + + if model_arch == "flux": + channels = pipeline.transformer.config.in_channels + expected_shape = (batch_size, expected_height * expected_width, channels) + elif model_arch == "stable-diffusion-3": + out_channels = pipeline.transformer.config.out_channels + expected_shape = (batch_size, out_channels, expected_height, expected_width) + else: + out_channels = pipeline.unet.config.out_channels + expected_shape = (batch_size, out_channels, expected_height, expected_width) + + self.assertEqual(outputs.shape, expected_shape) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -224,45 +257,22 @@ def test_image_reproducibility(self, model_arch: str): self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(NEGATIVE_PROMPT_SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + inputs["negative_prompt"] = ["This is a negative prompt"] * batch_size - negative_prompt = ["This is a negative prompt"] - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - images_1 = pipeline(**inputs, negative_prompt=negative_prompt, generator=get_generator("pt", SEED)).images - prompt = inputs.pop("prompt") - - if model_arch == "stable-diffusion-xl": - ( - inputs["prompt_embeds"], - inputs["negative_prompt_embeds"], - inputs["pooled_prompt_embeds"], - inputs["negative_pooled_prompt_embeds"], - ) = pipeline.encode_prompt( - prompt=prompt, - num_images_per_prompt=1, - device=torch.device("cpu"), - do_classifier_free_guidance=True, - negative_prompt=negative_prompt, - ) - else: - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( - prompt=prompt, - num_images_per_prompt=1, - device=torch.device("cpu"), - do_classifier_free_guidance=True, - negative_prompt=negative_prompt, - ) - - images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images - - np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2) + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand( grid_parameters( @@ -285,9 +295,9 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device.type, "cuda") outputs = pipeline(**inputs).images - self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) @@ -326,7 +336,19 @@ def test_safety_checker(self, model_arch: str): class ORTPipelineForImage2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = 
["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES += ["stable-diffusion-3"] + + CALLBACK_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] AUTOMODEL_CLASS = AutoPipelineForImage2Image ORTMODEL_CLASS = ORTPipelineForImage2Image @@ -373,14 +395,14 @@ def test_num_images_per_prompt(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for batch_size in [1, 3]: - for height in [64, 128]: - for width in [64, 128]: + for height in [16, 32]: + for width in [16, 32]: for num_images_per_prompt in [1, 3]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -398,15 +420,16 @@ def __init__(self): def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 + return kwargs ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_callback = Callback() auto_callback = Callback() - # callback_steps=1 to trigger callback every step - ort_pipe(**inputs, callback=ort_callback, callback_steps=1) - auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + ort_pipe(**inputs, callback_on_step_end=ort_callback) + auto_pipe(**inputs, callback_on_step_end=auto_callback) self.assertTrue(ort_callback.has_been_called) self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) @@ -434,9 +457,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -454,10 +487,10 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -541,7 +574,17 @@ def test_safety_checker(self, model_arch: str): class ORTPipelineForInpaintingTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = 
["stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + ] + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES += ["stable-diffusion-3"] + + CALLBACK_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + ] AUTOMODEL_CLASS = AutoPipelineForInpainting ORTMODEL_CLASS = ORTPipelineForInpainting @@ -593,14 +636,14 @@ def test_num_images_per_prompt(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for batch_size in [1, 3]: - for height in [64, 128]: - for width in [64, 128]: + for height in [16, 32]: + for width in [16, 32]: for num_images_per_prompt in [1, 3]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -618,15 +661,16 @@ def __init__(self): def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 + return kwargs ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_callback = Callback() auto_callback = Callback() - # callback_steps=1 to trigger callback every step - ort_pipe(**inputs, callback=ort_callback, callback_steps=1) - auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + ort_pipe(**inputs, callback_on_step_end=ort_callback) + auto_pipe(**inputs, callback_on_step_end=auto_callback) self.assertTrue(ort_callback.has_been_called) self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) @@ -654,9 +698,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -674,10 +728,10 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -719,7 +773,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) pipeline = 
self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - self.assertEqual(pipeline.device, "cuda") + self.assertEqual(pipeline.device.type, "cuda") outputs = pipeline(**inputs).images self.assertIsInstance(outputs, np.ndarray) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 84ac27029f9..c4340dcd8b6 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -143,7 +143,7 @@ class ORTModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.TEST_MODEL_ID = "sshleifer/tiny-distilbert-base-cased-distilled-squad" - self.LOCAL_MODEL_PATH = "assets/onnx" + self.LOCAL_MODEL_PATH = "tests/assets/onnx" self.ONNX_MODEL_ID = "philschmid/distilbert-onnx" self.TINY_ONNX_MODEL_ID = "fxmarty/resnet-tiny-beans" self.FAIL_ONNX_MODEL_ID = "sshleifer/tiny-distilbert-base-cased-distilled-squad" diff --git a/tests/onnxruntime/test_quantization.py b/tests/onnxruntime/test_quantization.py index 34a9504f95a..cf451590fbd 100644 --- a/tests/onnxruntime/test_quantization.py +++ b/tests/onnxruntime/test_quantization.py @@ -42,10 +42,10 @@ class ORTQuantizerTest(unittest.TestCase): LOAD_CONFIGURATION = { "local_asset": { - "model_or_path": "assets/onnx", + "model_or_path": "tests/assets/onnx", }, "local_asset_different_name": { - "model_or_path": "assets/onnx", + "model_or_path": "tests/assets/onnx", "file_name": "different_name.onnx", }, "ort_model_class": { diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 9f200e69b3d..ba8f6cc4abc 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -98,6 +98,7 @@ }, "falcon": "fxmarty/really-tiny-falcon-testing", "flaubert": "hf-internal-testing/tiny-random-flaubert", + "flux": "optimum-internal-testing/tiny-random-flux", "gemma": "fxmarty/tiny-random-GemmaForCausalLM", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", @@ -108,10 +109,10 @@ "groupvit": "hf-internal-testing/tiny-random-groupvit", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", - "levit": "hf-internal-testing/tiny-random-LevitModel", "latent-consistency": "echarlaix/tiny-random-latent-consistency", "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", + "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-LongT5Model", "llama": "optimum-internal-testing/tiny-random-llama", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", @@ -143,6 +144,7 @@ "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "speech_to_text": "hf-internal-testing/tiny-random-Speech2TextModel", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", + "stable-diffusion-3": "optimum-internal-testing/tiny-random-stable-diffusion-3", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", From d2a5a6aa2adbe9561527a85c4a4947a6d7fcfa58 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 21 Nov 2024 15:03:14 +0100 Subject: [PATCH 39/50] Remove datasets as required dependency (#2087) * remove datasets required dependency * 
install datasets when needed * add datasets installed when needed * style * add require dataset * divide datasets tests * import datasets only when needed --- .github/workflows/dev_test_benckmark.yml | 2 +- .github/workflows/test_benckmark.yml | 2 +- .github/workflows/test_utils.yml | 11 ++++++++++- optimum/gptq/data.py | 16 ++++++++++++++- optimum/gptq/quantizer.py | 2 +- optimum/onnxruntime/configuration.py | 15 +++++++++----- optimum/onnxruntime/model.py | 9 ++++++--- optimum/onnxruntime/quantization.py | 17 ++++++++++------ optimum/onnxruntime/runs/calibrator.py | 10 ++++++---- optimum/runs_base.py | 8 +++++--- optimum/utils/__init__.py | 1 + optimum/utils/import_utils.py | 12 ++++++++++++ optimum/utils/preprocessing/base.py | 19 +++++++++++++----- optimum/utils/testing_utils.py | 5 +++++ pyproject.toml | 1 + setup.py | 3 --- tests/utils/test_task_processors.py | 25 +++++++++++++++++++++++- 17 files changed, 123 insertions(+), 35 deletions(-) diff --git a/.github/workflows/dev_test_benckmark.yml b/.github/workflows/dev_test_benckmark.yml index a898d288625..381197b129a 100644 --- a/.github/workflows/dev_test_benckmark.yml +++ b/.github/workflows/dev_test_benckmark.yml @@ -23,7 +23,7 @@ jobs: - name: Install dependencies run: | pip install wheel - pip install .[tests,onnxruntime,benchmark] + pip install .[tests,onnxruntime,benchmark] datasets pip install -U git+https://github.com/huggingface/evaluate pip install -U git+https://github.com/huggingface/diffusers pip install -U git+https://github.com/huggingface/transformers diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml index e859e845d64..fe7df1a20cc 100644 --- a/.github/workflows/test_benckmark.yml +++ b/.github/workflows/test_benckmark.yml @@ -30,7 +30,7 @@ jobs: - name: Install dependencies run: | pip install wheel - pip install .[tests,onnxruntime,benchmark] + pip install .[tests,onnxruntime,benchmark] datasets - name: Test with unittest run: | python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml index 0126b023c60..bbe00e62841 100644 --- a/.github/workflows/test_utils.yml +++ b/.github/workflows/test_utils.yml @@ -37,4 +37,13 @@ jobs: - name: Test with pytest working-directory: tests run: | - python -m pytest -s -vvvv utils + pytest utils -s -n auto -m "not datasets_test" --durations=0 + + - name: Install datasets + run: | + pip install datasets + + - name: Tests needing datasets + working-directory: tests + run: | + pytest utils -s -n auto -m "datasets_test" --durations=0 \ No newline at end of file diff --git a/optimum/gptq/data.py b/optimum/gptq/data.py index b8734da478e..7e5fc0b43db 100644 --- a/optimum/gptq/data.py +++ b/optimum/gptq/data.py @@ -18,7 +18,12 @@ import numpy as np import torch -from datasets import load_dataset + +from optimum.utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available + + +if is_datasets_available(): + from datasets import load_dataset """ @@ -113,6 +118,9 @@ def pad_block(block, pads): def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_wikitext2")) + if split == "train": data = load_dataset("wikitext", "wikitext-2-raw-v1", split="train") elif split == "validation": @@ -132,6 +140,9 @@ def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "trai def get_c4(tokenizer: Any, seqlen: int, nsamples: 
int, split: str = "train"): + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4")) + if split == "train": data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}) elif split == "validation": @@ -157,6 +168,9 @@ def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): def get_c4_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4_new")) + if split == "train": data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}) elif split == "validation": diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 949d4d260df..849d8821ebf 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -88,7 +88,7 @@ def __init__( dataset (`Union[List[str], str, Any]`, defaults to `None`): The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...]) - or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new']. + or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new']. group_size (int, defaults to 128): The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. damp_percent (`float`, defaults to `0.1`): diff --git a/optimum/onnxruntime/configuration.py b/optimum/onnxruntime/configuration.py index 2e3d9f32d6a..adc1984795a 100644 --- a/optimum/onnxruntime/configuration.py +++ b/optimum/onnxruntime/configuration.py @@ -18,9 +18,8 @@ from dataclasses import asdict, dataclass, field from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from datasets import Dataset from packaging.version import Version, parse from onnxruntime import __version__ as ort_version @@ -33,6 +32,10 @@ from ..utils import logging +if TYPE_CHECKING: + from datasets import Dataset + + logger = logging.get_logger(__name__) # This value is used to indicate ORT which axis it should use to quantize an operator "per-channel" @@ -117,7 +120,9 @@ def create_calibrator( class AutoCalibrationConfig: @staticmethod - def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: float = 0.01) -> CalibrationConfig: + def minmax( + dataset: "Dataset", moving_average: bool = False, averaging_constant: float = 0.01 + ) -> CalibrationConfig: """ Args: dataset (`Dataset`): @@ -151,7 +156,7 @@ def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: f @staticmethod def entropy( - dataset: Dataset, + dataset: "Dataset", num_bins: int = 128, num_quantized_bins: int = 128, ) -> CalibrationConfig: @@ -188,7 +193,7 @@ def entropy( ) @staticmethod - def percentiles(dataset: Dataset, num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig: + def percentiles(dataset: "Dataset", num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig: """ Args: dataset (`Dataset`): diff --git a/optimum/onnxruntime/model.py b/optimum/onnxruntime/model.py index caa662f3824..4182abc925f 100644 --- a/optimum/onnxruntime/model.py +++ b/optimum/onnxruntime/model.py @@ -14,10 +14,9 @@ import logging import os -from typing import Callable, Dict, List, Optional, 
Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union import numpy as np -from datasets import Dataset from transformers import EvalPrediction from transformers.trainer_pt_utils import nested_concat from transformers.trainer_utils import EvalLoopOutput @@ -25,6 +24,10 @@ from onnxruntime import InferenceSession +if TYPE_CHECKING: + from datasets import Dataset + + logger = logging.getLogger(__name__) @@ -59,7 +62,7 @@ def __init__( self.session = InferenceSession(str(model_path), providers=[execution_provider]) self.onnx_input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} - def evaluation_loop(self, dataset: Dataset): + def evaluation_loop(self, dataset: "Dataset"): """ Run evaluation and returns metrics and predictions. diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py index f637916dcd2..054a2310a6b 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -21,7 +21,6 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import onnx -from datasets import Dataset, load_dataset from packaging.version import Version, parse from transformers import AutoConfig @@ -29,6 +28,7 @@ from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantizationMode, QuantType from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer from onnxruntime.quantization.qdq_quantizer import QDQQuantizer +from optimum.utils.import_utils import requires_backends from ..quantization_base import OptimumQuantizer from ..utils.save_utils import maybe_save_preprocessors @@ -40,6 +40,7 @@ if TYPE_CHECKING: + from datasets import Dataset from transformers import PretrainedConfig LOGGER = logging.getLogger(__name__) @@ -48,7 +49,7 @@ class ORTCalibrationDataReader(CalibrationDataReader): __slots__ = ["batch_size", "dataset", "_dataset_iter"] - def __init__(self, dataset: Dataset, batch_size: int = 1): + def __init__(self, dataset: "Dataset", batch_size: int = 1): if dataset is None: raise ValueError("Provided dataset is None.") @@ -158,7 +159,7 @@ def from_pretrained( def fit( self, - dataset: Dataset, + dataset: "Dataset", calibration_config: CalibrationConfig, onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx", operators_to_quantize: Optional[List[str]] = None, @@ -212,7 +213,7 @@ def fit( def partial_fit( self, - dataset: Dataset, + dataset: "Dataset", calibration_config: CalibrationConfig, onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx", operators_to_quantize: Optional[List[str]] = None, @@ -428,7 +429,7 @@ def get_calibration_dataset( seed: int = 2016, use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, - ) -> Dataset: + ) -> "Dataset": """ Creates the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. @@ -474,6 +475,10 @@ def get_calibration_dataset( "provided." 
) + requires_backends(self, ["datasets"]) + + from datasets import load_dataset + calib_dataset = load_dataset( dataset_name, name=dataset_config_name, @@ -492,7 +497,7 @@ def get_calibration_dataset( return self.clean_calibration_dataset(processed_calib_dataset) - def clean_calibration_dataset(self, dataset: Dataset) -> Dataset: + def clean_calibration_dataset(self, dataset: "Dataset") -> "Dataset": model = onnx.load(self.onnx_model_path) model_inputs = {input.name for input in model.graph.input} ignored_columns = list(set(dataset.column_names) - model_inputs) diff --git a/optimum/onnxruntime/runs/calibrator.py b/optimum/onnxruntime/runs/calibrator.py index c493a943747..bfdcd64d92e 100644 --- a/optimum/onnxruntime/runs/calibrator.py +++ b/optimum/onnxruntime/runs/calibrator.py @@ -1,6 +1,4 @@ -from typing import Dict, List - -from datasets import Dataset +from typing import TYPE_CHECKING, Dict, List from ...runs_base import Calibrator from .. import ORTQuantizer @@ -9,10 +7,14 @@ from ..preprocessors.passes import ExcludeGeLUNodes, ExcludeLayerNormNodes, ExcludeNodeAfter, ExcludeNodeFollowedBy +if TYPE_CHECKING: + from datasets import Dataset + + class OnnxRuntimeCalibrator(Calibrator): def __init__( self, - calibration_dataset: Dataset, + calibration_dataset: "Dataset", quantizer: ORTQuantizer, model_path: str, qconfig: QuantizationConfig, diff --git a/optimum/runs_base.py b/optimum/runs_base.py index 3a1d164c602..dadd445818f 100644 --- a/optimum/runs_base.py +++ b/optimum/runs_base.py @@ -2,13 +2,12 @@ import subprocess from contextlib import contextmanager from time import perf_counter_ns -from typing import Set +from typing import TYPE_CHECKING, Set import numpy as np import optuna import torch import transformers -from datasets import Dataset from tqdm import trange from . 
import version as optimum_version @@ -21,6 +20,9 @@ from .utils.runs import RunConfig, cpu_info_command +if TYPE_CHECKING: + from datasets import Dataset + os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -34,7 +36,7 @@ def get_autoclass_name(task): class Calibrator: def __init__( - self, calibration_dataset: Dataset, quantizer, model_path, qconfig, calibration_params, node_exclusion + self, calibration_dataset: "Dataset", quantizer, model_path, qconfig, calibration_params, node_exclusion ): self.calibration_dataset = calibration_dataset self.quantizer = quantizer diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 40d93d298e4..fb1794af49c 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -35,6 +35,7 @@ check_if_transformers_greater, is_accelerate_available, is_auto_gptq_available, + is_datasets_available, is_diffusers_available, is_onnx_available, is_onnxruntime_available, diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 35a6294ab52..405e3815b33 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -69,6 +69,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _auto_gptq_available = _is_package_available("auto_gptq") _timm_available = _is_package_available("timm") _sentence_transformers_available = _is_package_available("sentence_transformers") +_datasets_available = _is_package_available("datasets") torch_version = None if is_torch_available(): @@ -131,6 +132,10 @@ def is_sentence_transformers_available(): return _sentence_transformers_available +def is_datasets_available(): + return _datasets_available + + def is_auto_gptq_available(): if _auto_gptq_available: version_autogptq = version.parse(importlib_metadata.version("auto_gptq")) @@ -230,6 +235,12 @@ def require_numpy_strictly_lower(package_version: str, message: str): -U transformers`. Please note that you may need to restart your runtime after installation. """ +DATASETS_IMPORT_ERROR = """ +{0} requires the datasets library but it was not found in your environment. You can install it with pip: +`pip install datasets`. Please note that you may need to restart your runtime after installation. +""" + + BACKENDS_MAPPING = OrderedDict( [ ("diffusers", (is_diffusers_available, DIFFUSERS_IMPORT_ERROR)), @@ -245,6 +256,7 @@ def require_numpy_strictly_lower(package_version: str, message: str): "transformers_434", (lambda: check_if_transformers_greater("4.34"), "{0} " + TRANSFORMERS_IMPORT_ERROR.format("4.34")), ), + ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ] ) diff --git a/optimum/utils/preprocessing/base.py b/optimum/utils/preprocessing/base.py index dc995ccc50b..7cfda13ba7d 100644 --- a/optimum/utils/preprocessing/base.py +++ b/optimum/utils/preprocessing/base.py @@ -20,15 +20,16 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union -from datasets import Dataset, DatasetDict -from datasets import load_dataset as datasets_load_dataset from transformers import PreTrainedTokenizerBase from transformers.image_processing_utils import BaseImageProcessor +from optimum.utils.import_utils import requires_backends + from .. 
import logging if TYPE_CHECKING: + from datasets import Dataset, DatasetDict from transformers import PretrainedConfig @@ -102,11 +103,14 @@ def create_dataset_processing_func( def prepare_dataset( self, - dataset: Union[DatasetDict, Dataset], + dataset: Union["DatasetDict", "Dataset"], data_keys: Dict[str, str], ref_keys: Optional[List[str]] = None, split: Optional[str] = None, - ) -> Union[DatasetDict, Dataset]: + ) -> Union["DatasetDict", "Dataset"]: + requires_backends(self, ["datasets"]) + from datasets import Dataset + if isinstance(dataset, Dataset) and split is not None: raise ValueError("A Dataset and a split name were provided, but splits are for DatasetDict.") elif split is not None: @@ -131,7 +135,12 @@ def load_dataset( num_samples: Optional[int] = None, shuffle: bool = False, **load_dataset_kwargs, - ) -> Union[DatasetDict, Dataset]: + ) -> Union["DatasetDict", "Dataset"]: + requires_backends(self, ["datasets"]) + + from datasets import Dataset, DatasetDict + from datasets import load_dataset as datasets_load_dataset + dataset = datasets_load_dataset(path, **load_dataset_kwargs) if isinstance(dataset, DatasetDict) and load_smallest_split: diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index 76fe9a05b13..88b1acdb780 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -28,6 +28,7 @@ from . import ( is_accelerate_available, is_auto_gptq_available, + is_datasets_available, is_diffusers_available, is_sentence_transformers_available, is_timm_available, @@ -146,6 +147,10 @@ def require_sentence_transformers(test_case): return unittest.skipUnless(is_sentence_transformers_available(), "test requires sentence-transformers")(test_case) +def require_datasets(test_case): + return unittest.skipUnless(is_datasets_available(), "test requires datasets")(test_case) + + def grid_parameters( parameters: Dict[str, Iterable[Any]], yield_dict: bool = False, diff --git a/pyproject.toml b/pyproject.toml index 99a0f1c85fa..17bcd90e066 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "rocm_ep_test", "tensorflow_test", "timm_test", + "datasets_test", "run_in_series", "run_slow", "accelerate_test", diff --git a/setup.py b/setup.py index 29f97b604e0..6736085943a 100644 --- a/setup.py +++ b/setup.py @@ -13,14 +13,11 @@ REQUIRED_PKGS = [ - "coloredlogs", - "sympy", "transformers>=4.29", "torch>=1.11", "packaging", "numpy", "huggingface_hub>=0.8.0", - "datasets", ] # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index 16567048073..1a9f352a79f 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -19,16 +19,21 @@ from typing import TYPE_CHECKING, Any, Dict, Tuple, Union from unittest import TestCase -from datasets import DatasetDict +import pytest from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer +from optimum.utils.import_utils import is_datasets_available from optimum.utils.preprocessing import TaskProcessorsManager +from optimum.utils.testing_utils import require_datasets if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedTokenizerBase from transformers.image_processing_utils import BaseImageProcessor +if is_datasets_available(): + from datasets import DatasetDict + TEXT_MODEL_NAME = "bert-base-uncased" CONFIG = AutoConfig.from_pretrained(TEXT_MODEL_NAME) @@ -122,6 +127,8 @@ def 
test_create_defaults_and_kwargs_from_preprocessor_kwargs_does_not_mutate_pre
         )
         self.assertDictEqual(preprocessor_kwargs, clone)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_unallowed_data_keys(self):
         task_processor = TaskProcessorsManager.get_task_processor_class_for_task(self.TASK_NAME)(
             self.CONFIG, self.PREPROCESSOR
@@ -188,15 +195,23 @@ def _test_load_dataset(
 
         return dataset
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset(self):
         return self._test_load_dataset(False, False, False)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_by_guessing_data_keys(self):
         return self._test_load_dataset(False, True, False)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_and_only_keep_necessary_columns(self):
         return self._test_load_dataset(False, False, True)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_default_dataset(self):
         return self._test_load_dataset(True, False, False)
 
@@ -207,6 +222,8 @@ class TextClassificationProcessorTest(TestCase, TaskProcessorTestBase):
     PREPROCESSOR = TOKENIZER
     WRONG_PREPROCESSOR = IMAGE_PROCESSOR
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_with_max_length(self):
         max_length = random.randint(4, 16)
         dataset = self._test_load_dataset(False, False, True, max_length=max_length)
@@ -223,6 +240,8 @@ class TokenClassificationProcessorTest(TestCase, TaskProcessorTestBase):
     PREPROCESSOR = TOKENIZER
     WRONG_PREPROCESSOR = IMAGE_PROCESSOR
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_with_max_length(self):
         max_length = random.randint(4, 16)
         dataset = self._test_load_dataset(False, False, True, max_length=max_length)
@@ -232,6 +251,8 @@ def test_load_dataset_with_max_length(self):
         input_ids = dataset[0]["input_ids"]
         self.assertEqual(len(input_ids), max_length)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_default_dataset(self):
         self.skipTest(
             "Skipping so as not to execute conll2003 remote code (test would require trust_remote_code=True)"
@@ -244,6 +265,8 @@ class QuestionAnsweringProcessorTest(TestCase, TaskProcessorTestBase):
     PREPROCESSOR = TOKENIZER
     WRONG_PREPROCESSOR = IMAGE_PROCESSOR
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_with_max_length(self):
         max_length = 384
         dataset = self._test_load_dataset(False, False, True, max_length=max_length)

From 65a8a94adaf136dd677d28cfc837c0acfe993031 Mon Sep 17 00:00:00 2001
From: Raghu Ramarao
Date: Mon, 25 Nov 2024 18:30:00 +0530
Subject: [PATCH 40/50] Add ONNX Support for Decision Transformer Model (#2038)

* Decision Transformer to ONNX V0.1
* Decision Transformer to ONNX V0.2
* Update optimum/exporters/onnx/model_configs.py
* Apply suggestions from code review
* Update optimum/exporters/onnx/base.py
* Update optimum/exporters/onnx/model_configs.py
* Update optimum/utils/input_generators.py
* Update optimum/exporters/onnx/model_configs.py
* Apply suggestions from code review
* Update optimum/exporters/tasks.py
* ONNXToDT: changes to order of OrderedDict elements
* make style changes
* test
* remove custom normalized config
* remove unnecessary dynamic axes

---------

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Co-authored-by: IlyasMoutawwakil
---
 docs/source/exporters/onnx/overview.mdx | 1 +
 optimum/exporters/onnx/model_configs.py | 25 +++++++++++++++++
 optimum/exporters/tasks.py | 9 ++++++
 optimum/utils/__init__.py | 1 +
 optimum/utils/input_generators.py | 37 +++++++++++++++++++++++++
tests/exporters/exporters_utils.py | 1 + 6 files changed, 74 insertions(+) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 747e1396fb4..2eaada7dadd 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -36,6 +36,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Data2VecVision - Deberta - Deberta-v2 +- Decision Transformer - Deit - Detr - DistilBert diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 8984162ee8c..bca7cf24acf 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -27,6 +27,7 @@ BloomDummyPastKeyValuesGenerator, DummyAudioInputGenerator, DummyCodegenDecoderTextInputGenerator, + DummyDecisionTransformerInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, DummyFluxTransformerTextInputGenerator, @@ -263,6 +264,30 @@ class ImageGPTOnnxConfig(GPT2OnnxConfig): pass +class DecisionTransformerOnnxConfig(OnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (DummyDecisionTransformerInputGenerator,) + NORMALIZED_CONFIG_CLASS = NormalizedConfig + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "states": {0: "batch_size", 1: "sequence_length"}, + "actions": {0: "batch_size", 1: "sequence_length"}, + "timesteps": {0: "batch_size", 1: "sequence_length"}, + "returns_to_go": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "state_preds": {0: "batch_size", 1: "sequence_length"}, + "action_preds": {0: "batch_size", 1: "sequence_length"}, + "return_preds": {0: "batch_size", 1: "sequence_length"}, + "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, + } + + class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_attention_heads="num_heads") diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index b4bce4696f3..8f28ec42ce9 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -217,6 +217,7 @@ class TasksManager: "multiple-choice": "AutoModelForMultipleChoice", "object-detection": "AutoModelForObjectDetection", "question-answering": "AutoModelForQuestionAnswering", + "reinforcement-learning": "AutoModel", "semantic-segmentation": "AutoModelForSemanticSegmentation", "text-to-audio": ("AutoModelForTextToSpectrogram", "AutoModelForTextToWaveform"), "text-generation": "AutoModelForCausalLM", @@ -574,6 +575,11 @@ class TasksManager: onnx="DebertaV2OnnxConfig", tflite="DebertaV2TFLiteConfig", ), + "decision-transformer": supported_tasks_mapping( + "feature-extraction", + "reinforcement-learning", + onnx="DecisionTransformerOnnxConfig", + ), "deit": supported_tasks_mapping( "feature-extraction", "image-classification", @@ -2085,6 +2091,9 @@ def get_model_from_task( if original_task == "automatic-speech-recognition" or task == "automatic-speech-recognition": if original_task == "auto" and config.architectures is not None: model_class_name = config.architectures[0] + elif original_task == "reinforcement-learning" or task == "reinforcement-learning": + if config.architectures is not None: + model_class_name = config.architectures[0] if library_name == "diffusers": config = DiffusionPipeline.load_config(model_name_or_path, **kwargs) diff --git 
a/optimum/utils/__init__.py b/optimum/utils/__init__.py
index fb1794af49c..2aa90253d08 100644
--- a/optimum/utils/__init__.py
+++ b/optimum/utils/__init__.py
@@ -53,6 +53,7 @@
     DummyAudioInputGenerator,
     DummyBboxInputGenerator,
     DummyCodegenDecoderTextInputGenerator,
+    DummyDecisionTransformerInputGenerator,
     DummyDecoderTextInputGenerator,
     DummyEncodecInputGenerator,
     DummyFluxTransformerTextInputGenerator,
diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py
index 148072aa0b4..0ac1805f97d 100644
--- a/optimum/utils/input_generators.py
+++ b/optimum/utils/input_generators.py
@@ -507,6 +507,43 @@ class DummyDecoderTextInputGenerator(DummyTextInputGenerator):
     )
 
 
+class DummyDecisionTransformerInputGenerator(DummyTextInputGenerator):
+    """
+    Generates dummy decision transformer inputs.
+    """
+
+    SUPPORTED_INPUT_NAMES = (
+        "states",
+        "actions",
+        "timesteps",
+        "returns_to_go",
+        "attention_mask",
+    )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.act_dim = self.normalized_config.config.act_dim
+        self.state_dim = self.normalized_config.config.state_dim
+        self.max_ep_len = self.normalized_config.config.max_ep_len
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "states":
+            shape = [self.batch_size, self.sequence_length, self.state_dim]
+        elif input_name == "actions":
+            shape = [self.batch_size, self.sequence_length, self.act_dim]
+        elif input_name == "rewards":
+            shape = [self.batch_size, self.sequence_length, 1]
+        elif input_name == "returns_to_go":
+            shape = [self.batch_size, self.sequence_length, 1]
+        elif input_name == "attention_mask":
+            shape = [self.batch_size, self.sequence_length]
+        elif input_name == "timesteps":
+            shape = [self.batch_size, self.sequence_length]
+            return self.random_int_tensor(shape=shape, max_value=self.max_ep_len, framework=framework, dtype=int_dtype)
+
+        return self.random_float_tensor(shape, min_value=-2.0, max_value=2.0, framework=framework, dtype=float_dtype)
+
+
 class DummySeq2SeqDecoderTextInputGenerator(DummyDecoderTextInputGenerator):
     SUPPORTED_INPUT_NAMES = (
         "decoder_input_ids",
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 31059c403de..c56132c384c 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -67,6 +67,7 @@
     "data2vec-audio": "hf-internal-testing/tiny-random-Data2VecAudioModel",
     "deberta": "hf-internal-testing/tiny-random-DebertaModel",
     "deberta-v2": "hf-internal-testing/tiny-random-DebertaV2Model",
+    "decision-transformer": "edbeeching/decision-transformer-gym-hopper-medium",
     "deit": "hf-internal-testing/tiny-random-DeiTModel",
     "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder",
     "donut-swin": "hf-internal-testing/tiny-random-DonutSwinModel",

From a6c696c7de105e7691d432dd80102beec78d8fd4 Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Tue, 26 Nov 2024 20:52:43 +0100
Subject: [PATCH 41/50] Generate guidance for flux (#2104)

generate guidance

---
 optimum/onnxruntime/modeling_diffusion.py | 17 +++++++++++++++--
 optimum/utils/input_generators.py | 4 ++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 79d302be449..66b08e1ef66 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -437,8 +437,21 @@ def
to(self, device: Union[torch.device, str, int]): def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): return cls.load_config(config_name_or_path, **kwargs) - def _save_config(self, save_directory): - self.save_config(save_directory) + def _save_config(self, save_directory: Union[str, Path]): + model_dir = ( + self.model_save_dir + if not isinstance(self.model_save_dir, TemporaryDirectory) + else self.model_save_dir.name + ) + save_dir = Path(save_directory) + original_config = Path(model_dir) / self.config_name + if original_config.exists(): + if not save_dir.exists(): + save_dir.mkdir(parents=True) + + shutil.copy(original_config, save_dir) + else: + self.save_config(save_directory) @property def components(self) -> Dict[str, Any]: diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 0ac1805f97d..fbb77e6800a 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -1508,6 +1508,7 @@ class DummyFluxTransformerTextInputGenerator(DummyTransformerTextInputGenerator) SUPPORTED_INPUT_NAMES = ( "encoder_hidden_states", "pooled_projections", + "guidance", "txt_ids", ) @@ -1519,5 +1520,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int else [self.batch_size, self.sequence_length, 3] ) return self.random_int_tensor(shape, max_value=1, framework=framework, dtype=int_dtype) + elif input_name == "guidance": + shape = [self.batch_size] + return self.random_float_tensor(shape, min_value=0, max_value=1, framework=framework, dtype=float_dtype) return super().generate(input_name, framework, int_dtype, float_dtype) From bd08f12d2d4ebffdb2a25e32eabab759e4de88e5 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:13:11 +0100 Subject: [PATCH 42/50] Unbundle inputs generated by `DummyTimestepInputGenerator` (#2107) unbundle --- optimum/utils/input_generators.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index fbb77e6800a..18a2a5a3fd1 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -897,14 +897,14 @@ def __init__( ): self.task = task self.vocab_size = normalized_config.vocab_size - self.text_encoder_projection_dim = normalized_config.text_encoder_projection_dim - self.time_ids = 5 if normalized_config.requires_aesthetics_score else 6 + self.text_encoder_projection_dim = getattr(normalized_config, "text_encoder_projection_dim", None) + self.time_ids = 5 if getattr(normalized_config, "requires_aesthetics_score", False) else 6 if random_batch_size_range: low, high = random_batch_size_range self.batch_size = random.randint(low, high) else: self.batch_size = batch_size - self.time_cond_proj_dim = normalized_config.config.time_cond_proj_dim + self.time_cond_proj_dim = getattr(normalized_config.config, "time_cond_proj_dim", None) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "timestep": @@ -912,8 +912,16 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) if input_name == "text_embeds": + if self.text_encoder_projection_dim is None: + raise ValueError( + "Unable to infer the value of `text_encoder_projection_dim` for generating `text_embeds`, please double 
check the config of your model." + ) dim = self.text_encoder_projection_dim elif input_name == "timestep_cond": + if self.time_cond_proj_dim is None: + raise ValueError( + "Unable to infer the value of `time_cond_proj_dim` for generating `timestep_cond`, please double check the config of your model." + ) dim = self.time_cond_proj_dim else: dim = self.time_ids From 28bd0ad8fccfb6dd8019cd2882a88d69386a134c Mon Sep 17 00:00:00 2001 From: Brando Tovar <44623235+bndos@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:13:05 -0500 Subject: [PATCH 43/50] Pass the revision to SentenceTransformer models (#2105) feat: pass revision to SentenceTransformers --- optimum/exporters/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 8f28ec42ce9..c50fa5cdfa4 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -2128,6 +2128,7 @@ def get_model_from_task( device=device, cache_folder=cache_folder, token=token, + revision=revision, trust_remote_code=trust_remote_code, ) else: From f22655c036e4e61a7b09748e7aa7e146a16ae64d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Mlyn=C3=A1=C5=99?= <47664722+mlynatom@users.noreply.github.com> Date: Mon, 2 Dec 2024 14:54:08 +0100 Subject: [PATCH 44/50] Add RemBERT ONNX support (#2108) * ONNX config for RemBERT added * added RemBERT to TasksManager * rembert added to exporters_utils * RemBERT added to test modelling tasks * changed rembert model * added RemBERT to test utils * Added RemBERT to documentation * Apply suggestions from code review --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 4 ++++ optimum/exporters/tasks.py | 9 +++++++++ tests/exporters/exporters_utils.py | 3 ++- tests/onnxruntime/test_modeling.py | 5 +++++ tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 6 files changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 2eaada7dadd..57005b85678 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -83,6 +83,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - PoolFormer - Qwen2(Qwen1.5) - RegNet +- RemBERT - ResNet - Roberta - Roformer diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index bca7cf24acf..b39d19ec782 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -162,6 +162,10 @@ class SplinterOnnxConfig(BertOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class RemBertOnnxConfig(BertOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + + class DistilBertOnnxConfig(BertOnnxConfig): DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for transformers>=4.46.0 diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index c50fa5cdfa4..0a3758e97cf 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -431,6 +431,15 @@ class TasksManager: onnx="BertOnnxConfig", tflite="BertTFLiteConfig", ), + "rembert": supported_tasks_mapping( + "fill-mask", + "feature-extraction", + "text-classification", + "multiple-choice", + "token-classification", + "question-answering", + onnx="RemBertOnnxConfig", + ), # For big-bird and bigbird-pegasus being unsupported, refer to model_configs.py # "big-bird": supported_tasks_mapping( # 
"feature-extraction", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index c56132c384c..32156d9eebf 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -138,6 +138,7 @@ "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", # "rembert": "google/rembert", + "rembert": "hf-internal-testing/tiny-random-RemBertModel", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen2": "fxmarty/tiny-dummy-qwen2", "regnet": "hf-internal-testing/tiny-random-RegNetModel", @@ -257,7 +258,7 @@ "owlv2": "google/owlv2-base-patch16", "owlvit": "google/owlvit-base-patch32", "perceiver": "hf-internal-testing/tiny-random-PerceiverModel", # Not using deepmind/language-perceiver because it takes too much time for testing. - # "rembert": "google/rembert", + "rembert": "google/rembert", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "regnet": "facebook/regnet-y-040", "resnet": "microsoft/resnet-50", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index c4340dcd8b6..8f52ef45180 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -1312,6 +1312,7 @@ class ORTModelForQuestionAnsweringIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm_qa", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -1502,6 +1503,7 @@ class ORTModelForMaskedLMIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -1682,6 +1684,7 @@ class ORTModelForSequenceClassificationIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -1882,6 +1885,7 @@ class ORTModelForTokenClassificationIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -2227,6 +2231,7 @@ class ORTModelForMultipleChoiceIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index ba8f6cc4abc..cccecd53817 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -135,6 +135,7 @@ "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen2": "fxmarty/tiny-dummy-qwen2", + "rembert": "hf-internal-testing/tiny-random-RemBertModel", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-RobertaModel", "roformer": "hf-internal-testing/tiny-random-RoFormerModel", From 3ba10576e755f8e0740251c891082ee96e722afa Mon Sep 17 00:00:00 2001 From: "Tang, Wenyi" Date: Mon, 2 Dec 2024 22:55:04 +0800 Subject: [PATCH 45/50] Fix `ModelPatcher` returns empty outputs (#2109) * fix bug `ModelPatcher` returns empty outputs When model's output is tuple or list, `filtered_outputs` doesn't get assigned and hence always a empty dict * typo --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/onnx/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index fdfb0e280f5..2c0f9aeba67 100644 --- 
a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -168,7 +168,7 @@ def patched_forward(*args, **kwargs): filterd_outputs[name] = value elif isinstance(outputs, (list, tuple)): outputs_list = list(config.outputs.keys()) - dict(zip(outputs_list, outputs)) + filterd_outputs = dict(zip(outputs_list, outputs)) else: if len(config.outputs) > 1: num_outputs = len(config.outputs) From ff8c8fc95cb03b6ce72e0812bf0294bb2ae4463a Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 17:00:05 +0100 Subject: [PATCH 46/50] Fix workflow to mark issues as stale (#2110) * add permissions * update stale message --- .github/workflows/stale.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index a5e50a795b6..7b3eb5feb0c 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -6,9 +6,12 @@ on: jobs: stale: runs-on: ubuntu-latest + permissions: + issues: write steps: - - uses: actions/stale@v8 + - uses: actions/stale@v9 with: - stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.' + stale-issue-message: 'This issue has been marked as stale because it has been open for 30 days with no activity. This thread will be automatically closed in 5 days if no further activity occurs.' + exempt-issue-labels: 'bug,exporters,good first issue,onnx,onnxruntime,quantization' days-before-stale: 30 days-before-close: 5 From 01110adf076c94e395d1472a760eafac2c0a73aa Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 17:11:16 +0100 Subject: [PATCH 47/50] Remove doc-build (#2111) --- .github/workflows/build_main_documentation.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index c922f5097da..d38274f320a 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -18,12 +18,6 @@ jobs: repository: 'huggingface/doc-builder' path: doc-builder - - uses: actions/checkout@v2 - with: - repository: 'huggingface/doc-build' - path: doc-build - token: ${{ secrets.HUGGINGFACE_PUSH }} - - uses: actions/checkout@v2 with: repository: 'huggingface/optimum' From 7f2605ea94071f5495eac110ba240e2651ea8053 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 19:19:57 +0100 Subject: [PATCH 48/50] Downgrade stale bot to v8 and fix permissions (#2112) --- .github/workflows/stale.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 7b3eb5feb0c..28cf3ad9dc2 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -3,13 +3,14 @@ on: schedule: - cron: '30 1 * * *' +permissions: + issues: write + jobs: stale: runs-on: ubuntu-latest - permissions: - issues: write steps: - - uses: actions/stale@v9 + - uses: actions/stale@v8 with: stale-issue-message: 'This issue has been marked as stale because it has been open for 30 days with no activity. This thread will be automatically closed in 5 days if no further activity occurs.' 
exempt-issue-labels: 'bug,exporters,good first issue,onnx,onnxruntime,quantization' From d6de6762e0e4bf8136f0435211a0e777f5bf2f33 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 19:20:09 +0100 Subject: [PATCH 49/50] Update documentation color from google tpu section (#2113) * Update documentation color from google tpu section * fix --- docs/source/index.mdx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 06133664ca8..1b54570ea80 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -43,7 +43,7 @@ The packages below enable you to get the best of the 🤗 Hugging Face ecosystem
   [hunk body garbled in extraction — the surviving text shows the unchanged
   context card "Accelerate your training and inference workflows with AWS
   Trainium and AWS Inferentia", a removed and a re-added "Google TPUs" card
   heading (the one-line styling change this patch makes), and the unchanged
   context card "Accelerate your training and inference workflows with Google
   TPUs"; the surrounding HTML markup is unrecoverable]
-> [!TIP] -> Some packages provide hardware-agnostic features (e.g. INC interface in Optimum Intel). - - ## Open-source integrations 🤗 Optimum also supports a variety of open-source frameworks to make model optimization very easy. From 4a7cb298140ee9bed968d98a780a950d15bb2935 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:04:37 +0100 Subject: [PATCH 50/50] Fix workflow to mark PRs as stale (#2116) --- .github/workflows/stale.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 28cf3ad9dc2..6dc3ff2bbd9 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -5,6 +5,7 @@ on: permissions: issues: write + pull-requests: write jobs: stale: @@ -13,6 +14,10 @@ jobs: - uses: actions/stale@v8 with: stale-issue-message: 'This issue has been marked as stale because it has been open for 30 days with no activity. This thread will be automatically closed in 5 days if no further activity occurs.' + stale-pr-message: 'This PR has been marked as stale because it has been open for 90 days with no activity. This thread will be automatically closed in 30 days if no further activity occurs.' exempt-issue-labels: 'bug,exporters,good first issue,onnx,onnxruntime,quantization' - days-before-stale: 30 - days-before-close: 5 + days-before-issue-stale: 30 + days-before-issue-close: 5 + days-before-pr-stale: 90 + days-before-pr-close: 30 + exempt-all-pr-assignees: true \ No newline at end of file
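
A pattern worth noting from the `datasets`-decoupling diffs above (quantization, calibration, preprocessing): `datasets` is imported only under `TYPE_CHECKING` for annotations, availability is probed once without importing the package, and a descriptive error is raised at call time. Below is a minimal, self-contained sketch of that pattern; the helper name `shuffle_and_trim` and its body are illustrative assumptions, not code from this series.

```python
import importlib.util
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by static type checkers, so `datasets` stays an optional
    # runtime dependency of this module.
    from datasets import Dataset

# Probe availability once, without importing the package itself.
_datasets_available = importlib.util.find_spec("datasets") is not None


def shuffle_and_trim(dataset: "Dataset", num_samples: int) -> "Dataset":
    # Fail at call time rather than import time, mirroring `requires_backends`.
    if not _datasets_available:
        raise ImportError(
            "shuffle_and_trim requires the datasets library. You can install "
            "it with pip: `pip install datasets`."
        )
    return dataset.shuffle(seed=2016).select(range(num_samples))
```

The string annotation `"Dataset"` is what lets the signature survive when `datasets` is absent: it is never evaluated at runtime, which is exactly why the diffs above quote every `Dataset` annotation they touch.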
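The decision-transformer patch can likewise be smoke-tested end to end through the exporter API. The checkpoint and the `reinforcement-learning` task name come from the diffs above; the output directory and this exact invocation are assumed for illustration and are not part of the patch.

```python
from optimum.exporters.onnx import main_export

# Export the checkpoint registered in exporters_utils.py above;
# "reinforcement-learning" is the task added to TasksManager for this model.
main_export(
    "edbeeching/decision-transformer-gym-hopper-medium",
    output="decision_transformer_onnx",  # assumed output directory
    task="reinforcement-learning",
)
```

If the export succeeds, the resulting `model.onnx` should expose the `states`, `actions`, `timesteps`, `returns_to_go` and `attention_mask` inputs declared by `DecisionTransformerOnnxConfig`, each with `batch_size` and `sequence_length` as dynamic axes.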