diff --git a/mindone/diffusers/models/downsampling.py b/mindone/diffusers/models/downsampling.py index 0e5ebd2458..f5d268042e 100644 --- a/mindone/diffusers/models/downsampling.py +++ b/mindone/diffusers/models/downsampling.py @@ -292,7 +292,8 @@ def construct(self, inputs: ms.Tensor) -> ms.Tensor: inputs.shape[1], self.kernel.shape[0], self.kernel.shape[1], - ] + ], + dtype=inputs.dtype, ) indices = ops.arange(inputs.shape[1]) kernel = self.kernel.to(weight.dtype)[None, :].broadcast_to((inputs.shape[1], -1, -1)) diff --git a/mindone/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/mindone/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index b5b2351640..3dc58bc16a 100644 --- a/mindone/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/mindone/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -142,7 +142,7 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, negative_prompt): ) text_encoder_out = self.text_encoder(ms.Tensor(text_input_ids), output_hidden_states=True) - text_embeddings = text_encoder_out[0][-1] + text_embeddings = text_encoder_out[0] text_pooler_out = text_encoder_out[1] # get unconditional embeddings for classifier free guidance diff --git a/mindone/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/mindone/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 378260acdc..401c62fdc8 100644 --- a/mindone/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/mindone/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -1091,10 +1091,6 @@ def __call__( image = latents if not output_type == "latent": - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) if not return_dict: