Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sana support #1106

Merged
merged 15 commits into from
Jan 30, 2025
Prev Previous commit
Next Next commit
add pipeline
eaidova committed Jan 17, 2025
commit 6f3deaed9e4cb75a178f683ba1c93256fed087c7
7 changes: 7 additions & 0 deletions optimum/commands/export/openvino.py
Original file line number Diff line number Diff line change
@@ -105,6 +105,12 @@ def parse_args_openvino(parser: "ArgumentParser"):
"This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it."
),
)
optional_group.add_argument(
"--variant",
type=str,
default=None,
help=("Select a variant of the model to export."),
)
optional_group.add_argument(
"--ratio",
type=float,
@@ -463,5 +469,6 @@ def run(self):
stateful=not self.args.disable_stateful,
convert_tokenizer=not self.args.disable_convert_tokenizer,
library_name=library_name,
model_variant=self.args.variant,
# **input_shapes,
)
6 changes: 6 additions & 0 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
@@ -122,6 +122,7 @@ def main_export(
convert_tokenizer: bool = False,
library_name: Optional[str] = None,
model_loading_kwargs: Optional[Dict[str, Any]] = None,
model_variant: Optional[str] = None,
**kwargs_shapes,
):
"""
@@ -237,6 +238,8 @@ def main_export(
custom_architecture = False
patch_16bit = False
loading_kwargs = model_loading_kwargs or {}
if model_variant is not None:
loading_kwargs["variant"] = model_variant
if library_name == "transformers":
config = AutoConfig.from_pretrained(
model_name_or_path,
@@ -347,6 +350,7 @@ class StoreAttr(object):

GPTQQuantizer.post_init_model = post_init_model
elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"):
_loading_kwargs = {} if model_variant is None else {"variant": model_variant}
dtype = deduce_diffusers_dtype(
model_name_or_path,
revision=revision,
@@ -355,6 +359,7 @@ class StoreAttr(object):
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
**_loading_kwargs,
)
if dtype in [torch.float16, torch.bfloat16]:
loading_kwargs["torch_dtype"] = dtype
@@ -364,6 +369,7 @@ class StoreAttr(object):
if library_name == "open_clip":
model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
else:
logger.warn(loading_kwargs)
model = TasksManager.get_model_from_task(
task,
model_name_or_path,
66 changes: 56 additions & 10 deletions optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
@@ -1013,6 +1013,7 @@ def _get_submodels_and_export_configs(
def get_diffusion_models_for_export_ext(
pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino"
):
<<<<<<< HEAD
is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL")
is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3")
is_flux = pipeline.__class__.__name__.startswith("Flux")
@@ -1035,6 +1036,52 @@ def get_diffusion_models_for_export_ext(
models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype)
elif is_flux:
models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype)
=======
if is_diffusers_version(">=", "0.29.0"):
from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline

sd3_pipes = [StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline]
if is_diffusers_version(">=", "0.30.0"):
from diffusers import StableDiffusion3InpaintPipeline

sd3_pipes.append(StableDiffusion3InpaintPipeline)

is_sd3 = isinstance(pipeline, tuple(sd3_pipes))
logger.warn(f"IS SD3 {pipeline} {is_sd3}")
else:
is_sd3 = False

if is_diffusers_version(">=", "0.30.0"):
from diffusers import FluxPipeline

flux_pipes = [FluxPipeline]

if is_diffusers_version(">=", "0.31.0"):
from diffusers import FluxImg2ImgPipeline, FluxInpaintPipeline

flux_pipes.extend([FluxPipeline, FluxImg2ImgPipeline, FluxInpaintPipeline])

if is_diffusers_version(">=", "0.32.0"):
from diffusers import FluxFillPipeline

flux_pipes.append(FluxFillPipeline)

is_flux = isinstance(pipeline, tuple(flux_pipes))
else:
is_flux = False

if is_diffusers_version(">=", "0.32.0"):
from diffusers import SanaPipeline

is_sana = isinstance(pipeline, SanaPipeline)
else:
is_sana = False

if not any([is_sana, is_flux, is_sd3]):
return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)
if is_sd3:
models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype)
>>>>>>> add pipeline
elif is_sana:
models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype)
else:
@@ -1043,17 +1090,15 @@ def get_diffusion_models_for_export_ext(


def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype):
DEFAULT_DUMMY_SHAPES["heigh"] = DEFAULT_DUMMY_SHAPES["height"] // 4
DEFAULT_DUMMY_SHAPES["width"] = DEFAULT_DUMMY_SHAPES["width"] // 4
models_for_export = {}
text_encoder = pipeline.text_encoder
text_encoder_config_constructor = TasksManager.get_exporter_config_constructor(
model=text_encoder,
exporter=exporter,
library_name="diffusers",
task="feature-extraction",
model_type="gemma2-text-encoder",
)
model=text_encoder,
exporter=exporter,
library_name="diffusers",
task="feature-extraction",
model_type="gemma2-text-encoder",
)
text_encoder_export_config = text_encoder_config_constructor(
pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
)
@@ -1076,13 +1121,13 @@ def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype):
models_for_export["transformer"] = (transformer, transformer_export_config)
# VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
vae_encoder = copy.deepcopy(pipeline.vae)
vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters}
vae_encoder.forward = lambda sample: {"latent": vae_encoder.encode(x=sample)["latent"]}
vae_config_constructor = TasksManager.get_exporter_config_constructor(
model=vae_encoder,
exporter=exporter,
library_name="diffusers",
task="semantic-segmentation",
model_type="vae-encoder",
model_type="dcae-encoder",
)
vae_encoder_export_config = vae_config_constructor(
vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
@@ -1140,6 +1185,7 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype):
task="semantic-segmentation",
model_type="sd3-transformer",
)
logger.warn(f"TRANSFORMER COFG {export_config_constructor}")
transformer_export_config = export_config_constructor(
pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype
)
56 changes: 41 additions & 15 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
@@ -41,6 +41,7 @@
PhiOnnxConfig,
T5OnnxConfig,
UNetOnnxConfig,
VaeEncoderOnnxConfig,
VisionOnnxConfig,
WhisperOnnxConfig,
)
@@ -57,7 +58,6 @@
DummyVisionInputGenerator,
FalconDummyPastKeyValuesGenerator,
MistralDummyPastKeyValuesGenerator,
DummySeq2SeqDecoderTextInputGenerator
)
from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig

@@ -1893,52 +1893,78 @@ def rename_ambiguous_inputs(self, inputs):
class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
pass


@register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers")
class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
@property
def inputs(self) -> Dict[str, Dict[int, str]]:
return {
"input_ids": {0: "batch_size", 1: "sequence_length"},
"attention_mask": {0: "batch_size", 1: "sequence_length"}
"attention_mask": {0: "batch_size", 1: "sequence_length"},
}


class DummySeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
SUPPORTED_INPUT_NAMES = (
"decoder_input_ids",
"decoder_attention_mask",
"encoder_outputs",
"encoder_hidden_states",
"encoder_attention_mask"
"encoder_attention_mask",
)


class DummySanaTransformerVisionInputGenerator(DummyVisionInputGenerator):
def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
if input_name not in ["sample", "latent_sample"]:
return super().generate(input_name, framework, int_dtype, float_dtype)
return self.random_float_tensor(
shape=[self.batch_size, self.num_channels, self.height, self.width],
framework=framework,
dtype=float_dtype,
)
class DummySanaTransformerVisionInputGenerator(DummyUnetVisionInputGenerator):
def __init__(
self,
task: str,
normalized_config: NormalizedVisionConfig,
batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
width: int = DEFAULT_DUMMY_SHAPES["width"] // 8,
height: int = DEFAULT_DUMMY_SHAPES["height"] // 8,
# Reduce img shape by 4 for FLUX to reduce memory usage on conversion
**kwargs,
):
super().__init__(task, normalized_config, batch_size, num_channels, width=width, height=height, **kwargs)


@register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers")
class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig):
NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
image_size="sample_size",
num_channels="in_channels",
hidden_size="cross_attention_dim",
hidden_size="caption_channels",
vocab_size="attention_head_dim",
allow_new=True,
)
DUMMY_INPUT_GENERATOR_CLASSES = (DummySanaTransformerVisionInputGenerator, DummySeq2SeqDecoderTextWithEncMaskInputGenerator) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1]
DUMMY_INPUT_GENERATOR_CLASSES = (
DummySanaTransformerVisionInputGenerator,
DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator,
) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1]

@property
def inputs(self):
    """Return the export input specification: the inherited UNet inputs plus `encoder_attention_mask`."""
    common_inputs = super().inputs
    # The Sana transformer additionally consumes the text-encoder attention mask
    # (mask over the prompt tokens), so register its dynamic axes here.
    common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"}
    return common_inputs

def rename_ambiguous_inputs(self, inputs):
    """Rename the generic `sample` input to `hidden_states` to match the model's forward signature."""
    # The transformer's forward() expects `hidden_states` rather than the generic
    # `sample` name, hence the export input name is updated here.
    hidden_states = inputs.pop("sample", None)
    if hidden_states is not None:
        inputs["hidden_states"] = hidden_states
    return inputs


@register_in_tasks_manager("dcae-encoder", *["semantic-segmentation"], library_name="diffusers")
class DcaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig):
    """Export config for the Sana pipeline's DC-AE (deep-compression autoencoder) encoder.

    Unlike a KL VAE encoder, this encoder is exported to emit a single `latent`
    tensor directly (see the patched `vae_encoder.forward` returning
    ``{"latent": ...}`` in the Sana export path) rather than latent-distribution
    parameters, so only the `outputs` spec differs from the base config.
    """

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # Dynamic axes: batch (0) and the spatial latent dims (2, 3); the channel
        # axis (1) is fixed by the model config.
        return {
            "latent": {0: "batch_size", 2: "height_latent", 3: "width_latent"},
        }


class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator):
SUPPORTED_INPUT_NAMES = (
2 changes: 2 additions & 0 deletions optimum/intel/__init__.py
Original file line number Diff line number Diff line change
@@ -127,6 +127,7 @@
"OVFluxImg2ImgPipeline",
"OVFluxInpaintPipeline",
"OVFluxFillPipeline",
"OVSanaPipeline",
"OVPipelineForImage2Image",
"OVPipelineForText2Image",
"OVPipelineForInpainting",
@@ -150,6 +151,7 @@
"OVFluxImg2ImgPipeline",
"OVFluxInpaintPipeline",
"OVFluxFillPipeline",
"OVSanaPipeline",
"OVPipelineForImage2Image",
"OVPipelineForText2Image",
"OVPipelineForInpainting",
1 change: 1 addition & 0 deletions optimum/intel/openvino/__init__.py
Original file line number Diff line number Diff line change
@@ -91,6 +91,7 @@
OVPipelineForImage2Image,
OVPipelineForInpainting,
OVPipelineForText2Image,
OVSanaPipeline,
OVStableDiffusion3Img2ImgPipeline,
OVStableDiffusion3InpaintPipeline,
OVStableDiffusion3Pipeline,
Loading