diff --git a/docs/diffusers/limitations.md b/docs/diffusers/limitations.md index 1b276e85a5..30b6153c71 100644 --- a/docs/diffusers/limitations.md +++ b/docs/diffusers/limitations.md @@ -53,3 +53,74 @@ The table below represents the current support in mindone/diffusers for each of | AttnSkipDownBlock2D | ❌ | ✅ | ✅ | ✅ | contains FirDownsample2D | | SkipDownBlock2D | ❌ | ✅ | ✅ | ✅ | contains FirDownsample2D | | ResnetBlock2D (kernel='fir') | ❌ | ✅ | ✅ | ✅ | ops.Conv2D has poor precision in fp16 and PyNative mode | + +## Pipelines +The table below represents the current support in mindone/diffusers for each of these pipelines in **MindSpore 2.3.0**, +indicating whether each is supported in Pynative FP16 mode, Graph FP16 mode, Pynative FP32 mode, or Graph FP32 mode. + +> Hint: GroupNorm has a precision issue under FP16 that degrades the inference precision of almost all pipelines, +> so the experiments in the table below upcast GroupNorm to FP32 by default to avoid +> this issue. 
+ +| **Pipelines** | **Pynative FP16** | **Pynative FP32** | **Graph FP16** | **Graph FP32** | **Description** | +|:------------------------------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| +| AnimateDiffPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| AnimateDiffVideoToVideoPipeline | :white_check_mark: | :x: | :white_check_mark: | :white_check_mark: | In FP32 and Pynative mode, this pipeline will run out of memory | +| BlipDiffusionPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| ConsistencyModelPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| DDIMPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| DDPMPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| DiTPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| I2VGenXLPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | ops.bmm and ops.softmax have precision issues under FP16, so we need to upcast them to FP32 to get a good result | +| IFImg2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| IFImg2ImgSuperResolutionPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| IFInpaintingPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| IFInpaintingSuperResolutionPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| IFPipeline | :white_check_mark: | :white_check_mark: | 
:white_check_mark: | :white_check_mark: | | +| IFSuperResolutionPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| Kandinsky3Img2ImgPipeline | :x: | :x: | :x: | :x: | Kandinsky3 only provides FP16 weights; additionally, T5 has precision issues, so to achieve the desired results, you need to directly input prompt_embeds and attention_mask. | +| Kandinsky3Pipeline | :x: | :x: | :x: | :x: | Kandinsky3 only provides FP16 weights; additionally, T5 has precision issues, so to achieve the desired results, you need to directly input prompt_embeds and attention_mask. | +| KandinskyImg2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| KandinskyInpaintPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| KandinskyPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| KandinskyV22ControlnetImg2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| KandinskyV22ControlnetPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| KandinskyV22Img2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| KandinskyV22InpaintPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| KandinskyV22Pipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| LatentConsistencyModelImg2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| LatentConsistencyModelPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| LDMSuperResolutionPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| LDMTextToImagePipeline | :white_check_mark: | :white_check_mark: | 
:white_check_mark: | :white_check_mark: | | +| PixArtAlphaPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| ShapEImg2ImgPipeline | :white_check_mark: | :white_check_mark: | :x: | :x: | The syntax in Render only supports Pynative mode | +| ShapEPipeline | :white_check_mark: | :white_check_mark: | :x: | :x: | The syntax in Render only supports Pynative mode | +| StableCascadePipeline | :x: | :white_check_mark: | :x: | :white_check_mark: | This pipeline does not support FP16 due to precision issues | +| StableDiffusion3Pipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionAdapterPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionControlNetImg2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionControlNetInpaintPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionControlNetPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionDepth2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionDiffEditPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionGLIGENPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionGLIGENTextImagePipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionImageVariationPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionImg2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionInpaintPipeline | :white_check_mark: | 
:white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionInstructPix2PixPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionLatentUpscalePipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionUpscalePipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionXLAdapterPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionXLControlNetImg2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionXLControlNetInpaintPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionXLControlNetPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionXLImg2ImgPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionXLInpaintPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionXLInstructPix2PixPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableDiffusionXLPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| StableVideoDiffusionPipeline | :white_check_mark: | :x: | :white_check_mark: | :x: | This pipeline will run out of memory under FP32; ops.bmm and ops.softmax have precision issues under FP16, so we need to upcast them to FP32 to get a good result | +| UnCLIPImageVariationPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | +| UnCLIPPipeline | :white_check_mark: | 
:white_check_mark: | :white_check_mark: | :white_check_mark: | | +| WuerstchenPipeline | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | GlobalResponseNorm has precision issues under FP16, so we need to upcast it to FP32 to get a good result | diff --git a/mindone/diffusers/models/downsampling.py b/mindone/diffusers/models/downsampling.py index 0e5ebd2458..f5d268042e 100644 --- a/mindone/diffusers/models/downsampling.py +++ b/mindone/diffusers/models/downsampling.py @@ -292,7 +292,8 @@ def construct(self, inputs: ms.Tensor) -> ms.Tensor: inputs.shape[1], self.kernel.shape[0], self.kernel.shape[1], - ] + ], + dtype=inputs.dtype, ) indices = ops.arange(inputs.shape[1]) kernel = self.kernel.to(weight.dtype)[None, :].broadcast_to((inputs.shape[1], -1, -1)) diff --git a/mindone/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/mindone/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index b5b2351640..3dc58bc16a 100644 --- a/mindone/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/mindone/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -142,7 +142,7 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, negative_prompt): ) text_encoder_out = self.text_encoder(ms.Tensor(text_input_ids), output_hidden_states=True) - text_embeddings = text_encoder_out[0][-1] + text_embeddings = text_encoder_out[0] text_pooler_out = text_encoder_out[1] # get unconditional embeddings for classifier free guidance diff --git a/mindone/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/mindone/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 378260acdc..401c62fdc8 100644 --- a/mindone/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/mindone/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -1091,10 +1091,6 @@ 
def __call__( image = latents if not output_type == "latent": - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: