Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sana support #1106

Merged
merged 15 commits into from
Jan 30, 2025
Prev Previous commit
Next Next commit
add pipeline
eaidova committed Jan 17, 2025
commit 6f3deaed9e4cb75a178f683ba1c93256fed087c7
7 changes: 7 additions & 0 deletions optimum/commands/export/openvino.py
Original file line number Diff line number Diff line change
@@ -105,6 +105,12 @@ def parse_args_openvino(parser: "ArgumentParser"):
"This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it."
),
)
optional_group.add_argument(
"--variant",
type=str,
default=None,
help=("Select a variant of the model to export."),
)
optional_group.add_argument(
"--ratio",
type=float,
@@ -463,5 +469,6 @@ def run(self):
stateful=not self.args.disable_stateful,
convert_tokenizer=not self.args.disable_convert_tokenizer,
library_name=library_name,
model_variant=self.args.variant,
# **input_shapes,
)
6 changes: 6 additions & 0 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
@@ -122,6 +122,7 @@ def main_export(
convert_tokenizer: bool = False,
library_name: Optional[str] = None,
model_loading_kwargs: Optional[Dict[str, Any]] = None,
model_variant: Optional[str] = None,
**kwargs_shapes,
):
"""
@@ -237,6 +238,8 @@ def main_export(
custom_architecture = False
patch_16bit = False
loading_kwargs = model_loading_kwargs or {}
if model_variant is not None:
loading_kwargs["variant"] = model_variant
if library_name == "transformers":
config = AutoConfig.from_pretrained(
model_name_or_path,
@@ -347,6 +350,7 @@ class StoreAttr(object):

GPTQQuantizer.post_init_model = post_init_model
elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"):
_loading_kwargs = {} if model_variant is None else {"variant": model_variant}
dtype = deduce_diffusers_dtype(
model_name_or_path,
revision=revision,
@@ -355,6 +359,7 @@ class StoreAttr(object):
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
**_loading_kwargs,
)
if dtype in [torch.float16, torch.bfloat16]:
loading_kwargs["torch_dtype"] = dtype
@@ -364,6 +369,7 @@ class StoreAttr(object):
if library_name == "open_clip":
model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
else:
logger.warn(loading_kwargs)
model = TasksManager.get_model_from_task(
task,
model_name_or_path,
66 changes: 56 additions & 10 deletions optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
@@ -1013,6 +1013,7 @@ def _get_submodels_and_export_configs(
def get_diffusion_models_for_export_ext(
pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino"
):
<<<<<<< HEAD
is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL")
is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3")
is_flux = pipeline.__class__.__name__.startswith("Flux")
@@ -1035,6 +1036,52 @@ def get_diffusion_models_for_export_ext(
models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype)
elif is_flux:
models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype)
=======
if is_diffusers_version(">=", "0.29.0"):
from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline

sd3_pipes = [StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline]
if is_diffusers_version(">=", "0.30.0"):
from diffusers import StableDiffusion3InpaintPipeline

sd3_pipes.append(StableDiffusion3InpaintPipeline)

is_sd3 = isinstance(pipeline, tuple(sd3_pipes))
logger.warn(f"IS SD3 {pipeline} {is_sd3}")
else:
is_sd3 = False

if is_diffusers_version(">=", "0.30.0"):
from diffusers import FluxPipeline

flux_pipes = [FluxPipeline]

if is_diffusers_version(">=", "0.31.0"):
from diffusers import FluxImg2ImgPipeline, FluxInpaintPipeline

flux_pipes.extend([FluxPipeline, FluxImg2ImgPipeline, FluxInpaintPipeline])

if is_diffusers_version(">=", "0.32.0"):
from diffusers import FluxFillPipeline

flux_pipes.append(FluxFillPipeline)

is_flux = isinstance(pipeline, tuple(flux_pipes))
else:
is_flux = False

if is_diffusers_version(">=", "0.32.0"):
from diffusers import SanaPipeline

is_sana = isinstance(pipeline, SanaPipeline)
else:
is_sana = False

if not any([is_sana, is_flux, is_sd3]):
return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)
if is_sd3:
models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype)
>>>>>>> add pipeline
elif is_sana:
models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype)
else:
@@ -1043,17 +1090,15 @@ def get_diffusion_models_for_export_ext(


def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype):
DEFAULT_DUMMY_SHAPES["heigh"] = DEFAULT_DUMMY_SHAPES["height"] // 4
DEFAULT_DUMMY_SHAPES["width"] = DEFAULT_DUMMY_SHAPES["width"] // 4
models_for_export = {}
text_encoder = pipeline.text_encoder
text_encoder_config_constructor = TasksManager.get_exporter_config_constructor(
model=text_encoder,
exporter=exporter,
library_name="diffusers",
task="feature-extraction",
model_type="gemma2-text-encoder",
)
model=text_encoder,
exporter=exporter,
library_name="diffusers",
task="feature-extraction",
model_type="gemma2-text-encoder",
)
text_encoder_export_config = text_encoder_config_constructor(
pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
)
@@ -1076,13 +1121,13 @@ def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype):
models_for_export["transformer"] = (transformer, transformer_export_config)
# VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
vae_encoder = copy.deepcopy(pipeline.vae)
vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters}
vae_encoder.forward = lambda sample: {"latent": vae_encoder.encode(x=sample)["latent"]}
vae_config_constructor = TasksManager.get_exporter_config_constructor(
model=vae_encoder,
exporter=exporter,
library_name="diffusers",
task="semantic-segmentation",
model_type="vae-encoder",
model_type="dcae-encoder",
)
vae_encoder_export_config = vae_config_constructor(
vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
@@ -1140,6 +1185,7 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype):
task="semantic-segmentation",
model_type="sd3-transformer",
)
logger.warn(f"TRANSFORMER COFG {export_config_constructor}")
transformer_export_config = export_config_constructor(
pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype
)
56 changes: 41 additions & 15 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
@@ -41,6 +41,7 @@
PhiOnnxConfig,
T5OnnxConfig,
UNetOnnxConfig,
VaeEncoderOnnxConfig,
VisionOnnxConfig,
WhisperOnnxConfig,
)
@@ -57,7 +58,6 @@
DummyVisionInputGenerator,
FalconDummyPastKeyValuesGenerator,
MistralDummyPastKeyValuesGenerator,
DummySeq2SeqDecoderTextInputGenerator
)
from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig

@@ -1893,52 +1893,78 @@ def rename_ambiguous_inputs(self, inputs):
class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
pass


@register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers")
class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
@property
def inputs(self) -> Dict[str, Dict[int, str]]:
return {
"input_ids": {0: "batch_size", 1: "sequence_length"},
"attention_mask": {0: "batch_size", 1: "sequence_length"}
"attention_mask": {0: "batch_size", 1: "sequence_length"},
}


class DummySeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
SUPPORTED_INPUT_NAMES = (
"decoder_input_ids",
"decoder_attention_mask",
"encoder_outputs",
"encoder_hidden_states",
"encoder_attention_mask"
"encoder_attention_mask",
)


class DummySanaTransformerVisionInputGenerator(DummyVisionInputGenerator):
def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
if input_name not in ["sample", "latent_sample"]:
return super().generate(input_name, framework, int_dtype, float_dtype)
return self.random_float_tensor(
shape=[self.batch_size, self.num_channels, self.height, self.width],
framework=framework,
dtype=float_dtype,
)
class DummySanaTransformerVisionInputGenerator(DummyUnetVisionInputGenerator):
def __init__(
self,
task: str,
normalized_config: NormalizedVisionConfig,
batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
width: int = DEFAULT_DUMMY_SHAPES["width"] // 8,
height: int = DEFAULT_DUMMY_SHAPES["height"] // 8,
# Reduce img shape by 4 for FLUX to reduce memory usage on conversion
**kwargs,
):
super().__init__(task, normalized_config, batch_size, num_channels, width=width, height=height, **kwargs)


@register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers")
class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig):
NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
image_size="sample_size",
num_channels="in_channels",
hidden_size="cross_attention_dim",
hidden_size="caption_channels",
vocab_size="attention_head_dim",
allow_new=True,
)
DUMMY_INPUT_GENERATOR_CLASSES = (DummySanaTransformerVisionInputGenerator, DummySeq2SeqDecoderTextWithEncMaskInputGenerator) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1]
DUMMY_INPUT_GENERATOR_CLASSES = (
DummySanaTransformerVisionInputGenerator,
DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator,
) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1]

@property
def inputs(self):
    """Return the export input specification: the inherited UNet inputs plus `encoder_attention_mask`."""
    common_inputs = super().inputs
    # The Sana transformer additionally consumes the text-encoder attention mask
    # (mask over the prompt tokens), so register its dynamic axes here.
    common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"}
    return common_inputs

def rename_ambiguous_inputs(self, inputs):
    """Rename the generic `sample` input to `hidden_states` to match the model's forward signature."""
    # The transformer's forward() expects `hidden_states` rather than the generic
    # `sample` name, hence the export input name is updated here.
    hidden_states = inputs.pop("sample", None)
    if hidden_states is not None:
        inputs["hidden_states"] = hidden_states
    return inputs


@register_in_tasks_manager("dcae-encoder", *["semantic-segmentation"], library_name="diffusers")
class DcaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig):
    """Export config for the Sana pipeline's DC-AE (deep-compression autoencoder) encoder.

    Unlike a KL VAE encoder, this encoder is exported to emit a single `latent`
    tensor directly (see the patched `vae_encoder.forward` returning
    ``{"latent": ...}`` in the Sana export path) rather than latent-distribution
    parameters, so only the `outputs` spec differs from the base config.
    """

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # Dynamic axes: batch (0) and the spatial latent dims (2, 3); the channel
        # axis (1) is fixed by the model config.
        return {
            "latent": {0: "batch_size", 2: "height_latent", 3: "width_latent"},
        }


class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator):
SUPPORTED_INPUT_NAMES = (
2 changes: 2 additions & 0 deletions optimum/intel/__init__.py
Original file line number Diff line number Diff line change
@@ -127,6 +127,7 @@
"OVFluxImg2ImgPipeline",
"OVFluxInpaintPipeline",
"OVFluxFillPipeline",
"OVSanaPipeline",
"OVPipelineForImage2Image",
"OVPipelineForText2Image",
"OVPipelineForInpainting",
@@ -150,6 +151,7 @@
"OVFluxImg2ImgPipeline",
"OVFluxInpaintPipeline",
"OVFluxFillPipeline",
"OVSanaPipeline",
"OVPipelineForImage2Image",
"OVPipelineForText2Image",
"OVPipelineForInpainting",
1 change: 1 addition & 0 deletions optimum/intel/openvino/__init__.py
Original file line number Diff line number Diff line change
@@ -91,6 +91,7 @@
OVPipelineForImage2Image,
OVPipelineForInpainting,
OVPipelineForText2Image,
OVSanaPipeline,
OVStableDiffusion3Img2ImgPipeline,
OVStableDiffusion3InpaintPipeline,
OVStableDiffusion3Pipeline,
Loading