huggingface · DN6 · Jul 21, 2025 · Jul 22, 2025 · Jul 23, 2025 · Jul 29, 2025
diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
@@ -27,7 +27,7 @@
 from ...utils import logging
 from ...utils.torch_utils import randn_tensor, unwrap_module
 from ..modular_pipeline import (
-    PipelineBlock,
+    ModularPipelineBlocks,
     PipelineState,
 )
 from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
@@ -195,7 +195,7 @@ def prepare_latents_img2img(
     return latents
 
 
-class StableDiffusionXLInputStep(PipelineBlock):
+class StableDiffusionXLInputStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -213,11 +213,6 @@ def description(self) -> str:
     def inputs(self) -> List[InputParam]:
         return [
             InputParam("num_images_per_prompt", default=1),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[str]:
-        return [
             InputParam(
                 "prompt_embeds",
                 required=True,
@@ -394,7 +389,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         return components, state
 
 
-class StableDiffusionXLImg2ImgSetTimestepsStep(PipelineBlock):
+class StableDiffusionXLImg2ImgSetTimestepsStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -421,11 +416,6 @@ def inputs(self) -> List[InputParam]:
             InputParam("denoising_start"),
             # YiYi TODO: do we need num_images_per_prompt here?
             InputParam("num_images_per_prompt", default=1),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[str]:
-        return [
             InputParam(
                 "batch_size",
                 required=True,
@@ -543,7 +533,7 @@ def denoising_value_valid(dnv):
         return components, state
 
 
-class StableDiffusionXLSetTimestepsStep(PipelineBlock):
+class StableDiffusionXLSetTimestepsStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -611,7 +601,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         return components, state
 
 
-class StableDiffusionXLInpaintPrepareLatentsStep(PipelineBlock):
+class StableDiffusionXLInpaintPrepareLatentsStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -640,11 +630,6 @@ def inputs(self) -> List[Tuple[str, Any]]:
                 "`num_inference_steps`. A value of 1, therefore, essentially ignores `image`. Note that in the case of "
                 "`denoising_start` being declared as an integer, the value of `strength` will be ignored.",
             ),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[str]:
-        return [
             InputParam("generator"),
             InputParam(
                 "batch_size",
@@ -744,8 +729,6 @@ def prepare_latents_inpaint(
         timestep=None,
         is_strength_max=True,
         add_noise=True,
-        return_noise=False,
-        return_image_latents=False,
     ):
         shape = (
             batch_size,
@@ -768,7 +751,7 @@ def prepare_latents_inpaint(
         if image.shape[1] == 4:
             image_latents = image.to(device=device, dtype=dtype)
             image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
-        elif return_image_latents or (latents is None and not is_strength_max):
+        elif latents is None and not is_strength_max:
             image = image.to(device=device, dtype=dtype)
             image_latents = self._encode_vae_image(components, image=image, generator=generator)
             image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
@@ -786,13 +769,7 @@ def prepare_latents_inpaint(
             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
             latents = image_latents.to(device)
 
-        outputs = (latents,)
-
-        if return_noise:
-            outputs += (noise,)
-
-        if return_image_latents:
-            outputs += (image_latents,)
+        outputs = (latents, noise, image_latents)
 
         return outputs
 
@@ -864,7 +841,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         block_state.height = block_state.image_latents.shape[-2] * components.vae_scale_factor
         block_state.width = block_state.image_latents.shape[-1] * components.vae_scale_factor
 
-        block_state.latents, block_state.noise = self.prepare_latents_inpaint(
+        block_state.latents, block_state.noise, block_state.image_latents = self.prepare_latents_inpaint(
             components,
             block_state.batch_size * block_state.num_images_per_prompt,
             components.num_channels_latents,
@@ -878,8 +855,6 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
             timestep=block_state.latent_timestep,
             is_strength_max=block_state.is_strength_max,
             add_noise=block_state.add_noise,
-            return_noise=True,
-            return_image_latents=False,
         )
 
         # 7. Prepare mask latent variables
@@ -900,7 +875,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         return components, state
 
 
-class StableDiffusionXLImg2ImgPrepareLatentsStep(PipelineBlock):
+class StableDiffusionXLImg2ImgPrepareLatentsStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -920,11 +895,6 @@ def inputs(self) -> List[Tuple[str, Any]]:
             InputParam("latents"),
             InputParam("num_images_per_prompt", default=1),
             InputParam("denoising_start"),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[InputParam]:
-        return [
             InputParam("generator"),
             InputParam(
                 "latent_timestep",
@@ -981,7 +951,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         return components, state
 
 
-class StableDiffusionXLPrepareLatentsStep(PipelineBlock):
+class StableDiffusionXLPrepareLatentsStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -1002,11 +972,6 @@ def inputs(self) -> List[InputParam]:
             InputParam("width"),
             InputParam("latents"),
             InputParam("num_images_per_prompt", default=1),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[InputParam]:
-        return [
             InputParam("generator"),
             InputParam(
                 "batch_size",
@@ -1092,7 +1057,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         return components, state
 
 
-class StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep(PipelineBlock):
+class StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -1129,11 +1094,6 @@ def inputs(self) -> List[Tuple[str, Any]]:
             InputParam("num_images_per_prompt", default=1),
             InputParam("aesthetic_score", default=6.0),
             InputParam("negative_aesthetic_score", default=2.0),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[InputParam]:
-        return [
             InputParam(
                 "latents",
                 required=True,
@@ -1316,7 +1276,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         return components, state
 
 
-class StableDiffusionXLPrepareAdditionalConditioningStep(PipelineBlock):
+class StableDiffusionXLPrepareAdditionalConditioningStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -1345,11 +1305,6 @@ def inputs(self) -> List[Tuple[str, Any]]:
             InputParam("crops_coords_top_left", default=(0, 0)),
             InputParam("negative_crops_coords_top_left", default=(0, 0)),
             InputParam("num_images_per_prompt", default=1),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[InputParam]:
-        return [
             InputParam(
                 "latents",
                 required=True,
@@ -1499,7 +1454,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         return components, state
 
 
-class StableDiffusionXLControlNetInputStep(PipelineBlock):
+class StableDiffusionXLControlNetInputStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -1527,11 +1482,6 @@ def inputs(self) -> List[Tuple[str, Any]]:
             InputParam("controlnet_conditioning_scale", default=1.0),
             InputParam("guess_mode", default=False),
             InputParam("num_images_per_prompt", default=1),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[str]:
-        return [
             InputParam(
                 "latents",
                 required=True,
@@ -1718,7 +1668,7 @@ def __call__(self, components: StableDiffusionXLModularPipeline, state: Pipeline
         return components, state
 
 
-class StableDiffusionXLControlNetUnionInputStep(PipelineBlock):
+class StableDiffusionXLControlNetUnionInputStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property

diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py
@@ -24,7 +24,7 @@
 from ...models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
 from ...utils import logging
 from ..modular_pipeline import (
-    PipelineBlock,
+    ModularPipelineBlocks,
     PipelineState,
 )
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
@@ -33,7 +33,7 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-class StableDiffusionXLDecodeStep(PipelineBlock):
+class StableDiffusionXLDecodeStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property
@@ -56,17 +56,12 @@ def description(self) -> str:
     def inputs(self) -> List[Tuple[str, Any]]:
         return [
             InputParam("output_type", default="pil"),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[str]:
-        return [
             InputParam(
                 "latents",
                 required=True,
                 type_hint=torch.Tensor,
                 description="The denoised latents from the denoising step",
-            )
+            ),
         ]
 
     @property
@@ -157,7 +152,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState:
         return components, state
 
 
-class StableDiffusionXLInpaintOverlayMaskStep(PipelineBlock):
+class StableDiffusionXLInpaintOverlayMaskStep(ModularPipelineBlocks):
     model_name = "stable-diffusion-xl"
 
     @property