From 7b499de6d04eab1180dd86ab667c6a66a816f0d6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 03:35:15 +0100 Subject: [PATCH 01/23] up --- .../modular_pipeline_utils.py | 127 +++++++++++++++++- .../qwenimage/before_denoise.py | 40 +++--- 2 files changed, 146 insertions(+), 21 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index aa421a53727b..afc4d6959a6f 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -17,7 +17,7 @@ from collections import OrderedDict from dataclasses import dataclass, field, fields from typing import Any, Dict, List, Literal, Optional, Type, Union - +import PIL.Image import torch from ..configuration_utils import ConfigMixin, FrozenDict @@ -342,6 +342,121 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" + @classmethod + def prompt(cls) -> "InputParam": + return cls(name="prompt", type_hint=str, required=True, + description="The prompt or prompts to guide image generation.") + + @classmethod + def negative_prompt(cls) -> "InputParam": + return cls(name="negative_prompt", type_hint=str, default=None, + description="The prompt or prompts not to guide the image generation.") + + @classmethod + def max_sequence_length(cls, default: int = 512) -> "InputParam": + return cls(name="max_sequence_length", type_hint=int, default=default, + description="Maximum sequence length for prompt encoding.") + + @classmethod + def height(cls, default: Optional[int] = None) -> "InputParam": + return cls(name="height", type_hint=int, default=default, + description="The height in pixels of the generated image.") + + @classmethod + def width(cls, default: Optional[int] = None) -> "InputParam": + return cls(name="width", type_hint=int, default=default, + description="The width in pixels of the generated image.") + + @classmethod + def num_inference_steps(cls, default: int = 50) -> "InputParam": + return cls(name="num_inference_steps", type_hint=int, default=default, + description="The number of denoising steps.") + + + @classmethod + def num_images_per_prompt(cls, default: int = 1) -> "InputParam": + return cls(name="num_images_per_prompt", type_hint=int, default=default, + description="The number of images to generate per prompt.") + + @classmethod + def generator(cls) -> "InputParam": + return cls(name="generator", type_hint=torch.Generator, default=None, + description="Torch generator for deterministic generation.") + + + @classmethod + def sigmas(cls) -> "InputParam": + return cls(name="sigmas", type_hint=List[float], default=None, + description="Custom sigmas for the denoising process.") + + @classmethod + def strength(cls, default: float = 0.9) -> "InputParam": + return cls(name="strength", type_hint=float, default=default, + description="Strength for img2img/inpainting.") + + @classmethod + def image(cls) -> "InputParam": + return cls(name="image", type_hint=PIL.Image.Image, required=True, + description="Input image for img2img, editing, or conditioning.") + + @classmethod + def mask_image(cls) -> "InputParam": + return cls(name="mask_image", type_hint=PIL.Image.Image, required=True, + description="Mask image for inpainting.") + + @classmethod + def control_image(cls) -> "InputParam": + return cls(name="control_image", type_hint=PIL.Image.Image, required=True, + description="Control image for ControlNet 
conditioning.") + + @classmethod + def padding_mask_crop(cls) -> "InputParam": + return cls(name="padding_mask_crop", type_hint=int, default=None, + description="Padding for mask cropping in inpainting.") + + + @classmethod + def latents(cls) -> "InputParam": + return cls(name="latents", type_hint=torch.Tensor, default=None, + description="Pre-generated noisy latents for image generation.") + + + @classmethod + def timesteps(cls) -> "InputParam": + return cls(name="timesteps", type_hint=torch.Tensor, default=None, + description="Timesteps for the denoising process.") + + + # ===================================================================== + # ControlNet + # ===================================================================== + + @classmethod + def control_guidance_start(cls, default: float = 0.0) -> "InputParam": + return cls(name="control_guidance_start", type_hint=float, default=default, + description="When to start applying ControlNet.") + + @classmethod + def control_guidance_end(cls, default: float = 1.0) -> "InputParam": + return cls(name="control_guidance_end", type_hint=float, default=default, + description="When to stop applying ControlNet.") + + @classmethod + def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": + return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, + description="Scale for ControlNet conditioning.") + + + @classmethod + def output_type(cls) -> "InputParam": + return cls(name="output_type", type_hint=str, default="pil", + description="Output format: 'pil', 'np', 'pt', or 'latent'.") + + @classmethod + def attention_kwargs(cls) -> "InputParam": + return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, + description="Additional kwargs for attention processors.") + @dataclass class OutputParam: @@ -357,6 +472,16 @@ def __repr__(self): f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>" ) + @classmethod + def images(cls) -> "OutputParam": + return cls(name="images", type_hint=List[PIL.Image.Image], + description="Generated images.") + + @classmethod + def latents(cls) -> "OutputParam": + return cls(name="latents", type_hint=torch.Tensor, + description="Denoised latents.") + def format_inputs_short(inputs): """ diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 0c66d6ea3303..6fa4a971c2c5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -134,11 +134,11 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents"), - InputParam(name="height"), - InputParam(name="width"), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="generator"), + InputParam.latents(), + InputParam.height(), + InputParam.width(), + InputParam.num_images_per_prompt(), + InputParam.generator(), InputParam( name="batch_size", required=True, @@ -225,12 +225,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents"), - InputParam(name="height"), - InputParam(name="width"), - InputParam(name="layers", default=4), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="generator"), + InputParam.latents(), + InputParam.height(), + InputParam.width(), + InputParam(name="layers", type_hint=int, 
default=4), + InputParam.num_images_per_prompt(), + InputParam.generator(), InputParam( name="batch_size", required=True, @@ -466,8 +466,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_inference_steps", default=50), - InputParam(name="sigmas"), + InputParam.num_inference_steps(), + InputParam.sigmas(), InputParam( name="latents", required=True, @@ -532,8 +532,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("num_inference_steps", default=50, type_hint=int), - InputParam("sigmas", type_hint=List[float]), + InputParam.num_inference_steps(), + InputParam.sigmas(), InputParam("image_latents", required=True, type_hint=torch.Tensor), ] @@ -590,8 +590,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_inference_steps", default=50), - InputParam(name="sigmas"), + InputParam.num_inference_steps(), + InputParam.sigmas(), InputParam( name="latents", required=True, @@ -971,9 +971,9 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam("control_guidance_start", default=0.0), - InputParam("control_guidance_end", default=1.0), - InputParam("controlnet_conditioning_scale", default=1.0), + InputParam.control_guidance_start(), + InputParam.control_guidance_end(), + InputParam.controlnet_conditioning_scale(), InputParam("control_image_latents", required=True), InputParam( "timesteps", From b29873dee72ea60e155a2a14a72e6e6ee6195b63 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 10:52:53 +0100 Subject: [PATCH 02/23] up up --- .../modular_pipeline_utils.py | 57 +++++++++++------ .../qwenimage/before_denoise.py | 6 +- .../modular_pipelines/qwenimage/decoders.py | 28 +++------ .../modular_pipelines/qwenimage/denoise.py | 16 ++--- .../modular_pipelines/qwenimage/encoders.py | 61 ++++++++++--------- .../modular_pipelines/qwenimage/inputs.py | 34 +++++------ .../qwenimage/modular_blocks_qwenimage.py | 26 ++++++-- .../modular_pipelines/z_image/denoise.py | 5 +- 8 files changed, 125 insertions(+), 108 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index afc4d6959a6f..cb179eccc7f7 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -342,6 +342,18 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" + + @classmethod + def template(cls, name: str) -> Optional["InputParam"]: + """Get template for name if exists, otherwise None.""" + if hasattr(cls, name) and callable(getattr(cls, name)): + return getattr(cls, name)() + return None + + # ====================================================== + # InputParam templates + # ====================================================== + @classmethod def prompt(cls) -> "InputParam": return cls(name="prompt", type_hint=str, required=True, @@ -383,7 +395,6 @@ def generator(cls) -> "InputParam": return cls(name="generator", type_hint=torch.Generator, default=None, description="Torch generator for deterministic generation.") - @classmethod def sigmas(cls) -> "InputParam": return cls(name="sigmas", type_hint=List[float], default=None, @@ -394,6 +405,7 @@ def strength(cls, default: float = 0.9) -> "InputParam": 
return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") + # images @classmethod def image(cls) -> "InputParam": return cls(name="image", type_hint=PIL.Image.Image, required=True, @@ -425,12 +437,24 @@ def latents(cls) -> "InputParam": def timesteps(cls) -> "InputParam": return cls(name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process.") + + @classmethod + def output_type(cls) -> "InputParam": + return cls(name="output_type", type_hint=str, default="pil", + description="Output format: 'pil', 'np', 'pt''.") - - # ===================================================================== - # ControlNet - # ===================================================================== + @classmethod + def attention_kwargs(cls) -> "InputParam": + return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, + description="Additional kwargs for attention processors.") + + @classmethod + def denoiser_input_fields(cls) -> "InputParam": + return cls(kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, + description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.") + + # ControlNet @classmethod def control_guidance_start(cls, default: float = 0.0) -> "InputParam": return cls(name="control_guidance_start", type_hint=float, default=default, @@ -446,18 +470,6 @@ def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, description="Scale for ControlNet conditioning.") - - @classmethod - def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", - description="Output format: 'pil', 'np', 'pt', or 'latent'.") - - @classmethod - def attention_kwargs(cls) -> "InputParam": - return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, - description="Additional kwargs for attention processors.") - - @dataclass class OutputParam: """Specification for an output parameter.""" @@ -472,6 +484,17 @@ def __repr__(self): f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>" ) + @classmethod + def template(cls, name: str) -> Optional["OutputParam"]: + """Get template for name if exists, otherwise None.""" + if hasattr(cls, name) and callable(getattr(cls, name)): + return getattr(cls, name)() + return None + + # ====================================================== + # OutputParam templates + # ====================================================== + @classmethod def images(cls) -> "OutputParam": return cls(name="images", type_hint=List[PIL.Image.Image], diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 6fa4a971c2c5..d61711e13a52 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -228,7 +228,7 @@ def inputs(self) -> List[InputParam]: InputParam.latents(), InputParam.height(), InputParam.width(), - InputParam(name="layers", type_hint=int, default=4), + InputParam(name="layers", type_hint=int, default=4, description="Number of layers to extract from the image"), InputParam.num_images_per_prompt(), InputParam.generator(), InputParam( @@ -598,7 +598,7 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The latents to use for the denoising process, used 
to calculate the image sequence length.", ), - InputParam(name="strength", default=0.9), + InputParam.strength(0.9), ] @property @@ -886,7 +886,7 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam(name="batch_size", required=True), - InputParam(name="layers", required=True), + InputParam(name="layers", default=4, description="Number of layers to extract from the image"), InputParam(name="height", required=True), InputParam(name="width", required=True), InputParam(name="prompt_embeds_mask"), diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 24a88ebfca3c..9c3a1c01d018 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -91,7 +91,7 @@ def inputs(self) -> List[InputParam]: InputParam("latents", required=True, type_hint=torch.Tensor), InputParam("height", required=True, type_hint=int), InputParam("width", required=True, type_hint=int), - InputParam("layers", required=True, type_hint=int), + InputParam("layers", default=4, description="Number of layers to extract from the image"), ] @torch.no_grad() @@ -141,11 +141,7 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[str]: return [ - OutputParam( - "images", - type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], - description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array", - ) + OutputParam.images() ] @torch.no_grad() @@ -198,14 +194,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("output_type", default="pil", type_hint=str), + InputParam("latents", required=True, type_hint=torch.Tensor, description="The latents to decode, can be generated in the denoise step"), + InputParam.output_type(), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]), + OutputParam.images(), ] @torch.no_grad() @@ -273,12 +269,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam("images", required=True, description="the generated image from decoders step"), - InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", - ), + InputParam.output_type(), ] @staticmethod @@ -323,12 +314,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam("images", required=True, description="the generated image from decoders step"), - InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", - ), + InputParam.output_type(), InputParam("mask_overlay_kwargs"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index eb1e5a341c68..472945b2269a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -218,7 +218,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), + InputParam.attention_kwargs(), InputParam( "latents", required=True, @@ -231,10 +231,7 @@ def 
inputs(self) -> List[InputParam]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), InputParam( "img_shapes", required=True, @@ -322,7 +319,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), + InputParam.attention_kwargs(), InputParam( "latents", required=True, @@ -335,10 +332,7 @@ def inputs(self) -> List[InputParam]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), InputParam( "img_shapes", required=True, @@ -424,7 +418,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."), + OutputParam.latents(), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 4b66dd32e521..2eca8645ef2c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -301,8 +301,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" + InputParam.template(self._image_input_name) or InputParam( + name=self._image_input_name, required=True, type_hint=torch.Tensor, description="Input image for conditioning" ), ] @@ -381,7 +381,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( + InputParam.template(self._image_input_name) or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), InputParam( @@ -484,7 +484,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( + InputParam.template(self._image_input_name) or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, @@ -564,7 +564,7 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", type_hint=str, description="The prompt to encode"), + InputParam(name="prompt", type_hint=str, description="The prompt to encode"), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -647,11 +647,9 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), - InputParam( - name="max_sequence_length", type_hint=int, description="The max sequence length to use", default=1024 - ), + InputParam.prompt(), + 
InputParam.negative_prompt(), + InputParam.max_sequence_length(1024), ] @property @@ -772,8 +770,8 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.prompt(), + InputParam.negative_prompt(), InputParam( name="resized_image", required=True, @@ -895,8 +893,8 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.prompt(), + InputParam.negative_prompt(), InputParam( name="resized_cond_image", required=True, @@ -1010,11 +1008,11 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("padding_mask_crop"), + InputParam.mask_image(), + InputParam.image(), + InputParam.height(), + InputParam.width(), + InputParam.padding_mask_crop(), ] @property @@ -1082,9 +1080,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("resized_image", required=True), - InputParam("padding_mask_crop"), + InputParam.mask_image(), + InputParam("resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step"), + InputParam.padding_mask_crop(), ] @property @@ -1140,9 +1138,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), + InputParam.image(), + InputParam.height(), + InputParam.width(), ] @property @@ -1312,7 +1310,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam(self._image_input_name, required=True), InputParam("generator")] + return [ + InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.generator(), + ] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -1383,10 +1384,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam("control_image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("generator"), + InputParam.control_image(), + InputParam.height(), + InputParam.width(), + InputParam.generator(), ] return inputs diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 4a1cf3700c57..e28493ecc369 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -129,7 +129,7 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"), InputParam(name="prompt_embeds_mask", required=True, 
kwargs_type="denoiser_input_fields"),
             InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"),
@@ -269,17 +269,17 @@ def expected_components(self) -> List[ComponentSpec]:
     @property
     def inputs(self) -> List[InputParam]:
         inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
+            InputParam.num_images_per_prompt(),
             InputParam(name="batch_size", required=True),
-            InputParam(name="height"),
-            InputParam(name="width"),
+            InputParam.height(),
+            InputParam.width(),
         ]

         for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
+            inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name))

         for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
+            inputs.append(InputParam.template(input_name) or InputParam(name=input_name))

         return inputs

@@ -398,17 +398,17 @@ def expected_components(self) -> List[ComponentSpec]:
     @property
     def inputs(self) -> List[InputParam]:
         inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
+            InputParam.num_images_per_prompt(),
             InputParam(name="batch_size", required=True),
-            InputParam(name="height"),
-            InputParam(name="width"),
+            InputParam.height(),
+            InputParam.width(),
         ]

         for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
+            inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name))

         for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
+            inputs.append(InputParam.template(input_name) or InputParam(name=input_name))

         return inputs

@@ -544,15 +544,15 @@ def expected_components(self) -> List[ComponentSpec]:
     @property
     def inputs(self) -> List[InputParam]:
         inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
+            InputParam.num_images_per_prompt(),
             InputParam(name="batch_size", required=True),
         ]

         for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
+            inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name))

         for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
+            inputs.append(InputParam.template(input_name) or InputParam(name=input_name))

         return inputs

@@ -638,9 +638,9 @@ def inputs(self) -> List[InputParam]:
         return [
             InputParam(name="control_image_latents", required=True),
             InputParam(name="batch_size", required=True),
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="height"),
-            InputParam(name="width"),
+            InputParam.num_images_per_prompt(),
+            InputParam.height(),
+            InputParam.width(),
         ]

     @torch.no_grad()
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
index 63e9f5a28372..c349c7d9f224 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -54,7 +54,23 @@


 # ====================
-# 1. VAE ENCODER
+# 1. TEXT ENCODER
+# ====================
+
+class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
+    model_name = "qwenimage"
+    block_classes = [QwenImageTextEncoderStep()]
+    block_names = ["text_encoder"]
+    block_trigger_inputs = ["prompt"]
+
+    @property
+    def description(self) -> str:
+        return ("Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block."
+                " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided."
+                " - if `prompt` is not provided, step will be skipped.")
+
+# ====================
+# 2. VAE ENCODER
 # ====================


@@ -118,7 +134,7 @@ def description(self):


 # ====================
-# 2. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
+# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
 # ====================


@@ -396,7 +412,7 @@ def description(self):


 # ====================
-# 3. DECODE
+# 4. DECODE
 # ====================


@@ -439,11 +455,11 @@ def description(self):


 # ====================
-# 4. AUTO BLOCKS & PRESETS
+# 5. AUTO BLOCKS & PRESETS
 # ====================

 AUTO_BLOCKS = InsertableDict(
     [
-        ("text_encoder", QwenImageTextEncoderStep()),
+        ("text_encoder", QwenImageAutoTextEncoderStep()),
         ("vae_encoder", QwenImageAutoVaeEncoderStep()),
         ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
         ("denoise", QwenImageAutoCoreDenoiseStep()),
diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py
index 3d5a00a9df50..a165fb513f3c 100644
--- a/src/diffusers/modular_pipelines/z_image/denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/denoise.py
@@ -129,10 +129,7 @@ def inputs(self) -> List[Tuple[str, Any]]:
                 type_hint=int,
                 description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
+            InputParam.denoiser_input_fields(),
         ]
         guider_input_names = []
         uncond_guider_input_names = []

From 43ab14845d9cbf090e0de0f1f284bdec54008954 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Sat, 10 Jan 2026 10:56:54 +0100
Subject: [PATCH 03/23] update outputs

---
 .../modular_pipelines/qwenimage/modular_blocks_qwenimage.py | 6 ++----
 .../qwenimage/modular_blocks_qwenimage_edit.py | 6 ++----
 .../qwenimage/modular_blocks_qwenimage_edit_plus.py | 6 ++----
 .../qwenimage/modular_blocks_qwenimage_layered.py | 6 ++----
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
index f58dffd922fc..e112578c399d 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -418,9 +418,7 @@ def description(self):
     @property
     def outputs(self):
         return [
-            OutputParam(
-                name="latents", 
type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -349,5 +347,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.images(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 99c5b109bf38..345b0cd93560 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -144,9 +144,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -196,5 +194,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.images(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 63ee36df5112..965f9e1976ad 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -142,9 +142,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -174,5 +172,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.images(), ] From 34a743e2dc36dc0ce7a86251ab3c4b74f89beb00 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 10:57:27 +0100 Subject: [PATCH 04/23] style --- .../modular_pipeline_utils.py | 191 +++++++++++------- .../qwenimage/before_denoise.py | 4 +- .../modular_pipelines/qwenimage/decoders.py | 15 +- .../modular_pipelines/qwenimage/encoders.py | 27 ++- .../qwenimage/modular_blocks_qwenimage.py | 6 +- .../modular_blocks_qwenimage_edit.py | 5 +- .../modular_blocks_qwenimage_edit_plus.py | 4 - .../modular_blocks_qwenimage_layered.py | 5 - 8 files changed, 155 insertions(+), 102 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index cb179eccc7f7..fab7c7193e5d 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -17,6 +17,7 @@ from collections import OrderedDict from dataclasses import dataclass, field, fields from typing import Any, Dict, List, Literal, Optional, Type, Union + import PIL.Image import torch @@ -342,7 +343,6 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" - @classmethod def template(cls, name: str) -> Optional["InputParam"]: """Get template for name if exists, otherwise None.""" @@ -356,119 +356,172 @@ def template(cls, name: str) -> Optional["InputParam"]: @classmethod def prompt(cls) -> "InputParam": - return cls(name="prompt", type_hint=str, required=True, - description="The 
prompt or prompts to guide image generation.") - + return cls( + name="prompt", type_hint=str, required=True, description="The prompt or prompts to guide image generation." + ) + @classmethod def negative_prompt(cls) -> "InputParam": - return cls(name="negative_prompt", type_hint=str, default=None, - description="The prompt or prompts not to guide the image generation.") - + return cls( + name="negative_prompt", + type_hint=str, + default=None, + description="The prompt or prompts not to guide the image generation.", + ) + @classmethod def max_sequence_length(cls, default: int = 512) -> "InputParam": - return cls(name="max_sequence_length", type_hint=int, default=default, - description="Maximum sequence length for prompt encoding.") - + return cls( + name="max_sequence_length", + type_hint=int, + default=default, + description="Maximum sequence length for prompt encoding.", + ) + @classmethod def height(cls, default: Optional[int] = None) -> "InputParam": - return cls(name="height", type_hint=int, default=default, - description="The height in pixels of the generated image.") - + return cls( + name="height", type_hint=int, default=default, description="The height in pixels of the generated image." + ) + @classmethod def width(cls, default: Optional[int] = None) -> "InputParam": - return cls(name="width", type_hint=int, default=default, - description="The width in pixels of the generated image.") + return cls( + name="width", type_hint=int, default=default, description="The width in pixels of the generated image." + ) @classmethod def num_inference_steps(cls, default: int = 50) -> "InputParam": - return cls(name="num_inference_steps", type_hint=int, default=default, - description="The number of denoising steps.") - - + return cls( + name="num_inference_steps", type_hint=int, default=default, description="The number of denoising steps." + ) + @classmethod def num_images_per_prompt(cls, default: int = 1) -> "InputParam": - return cls(name="num_images_per_prompt", type_hint=int, default=default, - description="The number of images to generate per prompt.") - + return cls( + name="num_images_per_prompt", + type_hint=int, + default=default, + description="The number of images to generate per prompt.", + ) + @classmethod def generator(cls) -> "InputParam": - return cls(name="generator", type_hint=torch.Generator, default=None, - description="Torch generator for deterministic generation.") - + return cls( + name="generator", + type_hint=torch.Generator, + default=None, + description="Torch generator for deterministic generation.", + ) + @classmethod def sigmas(cls) -> "InputParam": - return cls(name="sigmas", type_hint=List[float], default=None, - description="Custom sigmas for the denoising process.") - + return cls( + name="sigmas", type_hint=List[float], default=None, description="Custom sigmas for the denoising process." 
+ ) + @classmethod def strength(cls, default: float = 0.9) -> "InputParam": - return cls(name="strength", type_hint=float, default=default, - description="Strength for img2img/inpainting.") - + return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") + # images @classmethod def image(cls) -> "InputParam": - return cls(name="image", type_hint=PIL.Image.Image, required=True, - description="Input image for img2img, editing, or conditioning.") - + return cls( + name="image", + type_hint=PIL.Image.Image, + required=True, + description="Input image for img2img, editing, or conditioning.", + ) + @classmethod def mask_image(cls) -> "InputParam": - return cls(name="mask_image", type_hint=PIL.Image.Image, required=True, - description="Mask image for inpainting.") - + return cls( + name="mask_image", type_hint=PIL.Image.Image, required=True, description="Mask image for inpainting." + ) + @classmethod def control_image(cls) -> "InputParam": - return cls(name="control_image", type_hint=PIL.Image.Image, required=True, - description="Control image for ControlNet conditioning.") - + return cls( + name="control_image", + type_hint=PIL.Image.Image, + required=True, + description="Control image for ControlNet conditioning.", + ) + @classmethod def padding_mask_crop(cls) -> "InputParam": - return cls(name="padding_mask_crop", type_hint=int, default=None, - description="Padding for mask cropping in inpainting.") - + return cls( + name="padding_mask_crop", + type_hint=int, + default=None, + description="Padding for mask cropping in inpainting.", + ) @classmethod def latents(cls) -> "InputParam": - return cls(name="latents", type_hint=torch.Tensor, default=None, - description="Pre-generated noisy latents for image generation.") - - + return cls( + name="latents", + type_hint=torch.Tensor, + default=None, + description="Pre-generated noisy latents for image generation.", + ) + @classmethod def timesteps(cls) -> "InputParam": - return cls(name="timesteps", type_hint=torch.Tensor, default=None, - description="Timesteps for the denoising process.") + return cls( + name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process." + ) @classmethod def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", - description="Output format: 'pil', 'np', 'pt''.") - + return cls(name="output_type", type_hint=str, default="pil", description="Output format: 'pil', 'np', 'pt''.") + @classmethod def attention_kwargs(cls) -> "InputParam": - return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, - description="Additional kwargs for attention processors.") + return cls( + name="attention_kwargs", + type_hint=Dict[str, Any], + default=None, + description="Additional kwargs for attention processors.", + ) @classmethod def denoiser_input_fields(cls) -> "InputParam": - return cls(kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.") - + return cls( + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", + ) # ControlNet @classmethod def control_guidance_start(cls, default: float = 0.0) -> "InputParam": - return cls(name="control_guidance_start", type_hint=float, default=default, - description="When to start applying ControlNet.") - + return cls( + name="control_guidance_start", + type_hint=float, + default=default, + description="When to start applying ControlNet.", + ) + @classmethod def control_guidance_end(cls, default: float = 1.0) -> "InputParam": - return cls(name="control_guidance_end", type_hint=float, default=default, - description="When to stop applying ControlNet.") - + return cls( + name="control_guidance_end", + type_hint=float, + default=default, + description="When to stop applying ControlNet.", + ) + @classmethod def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": - return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, - description="Scale for ControlNet conditioning.") + return cls( + name="controlnet_conditioning_scale", + type_hint=float, + default=default, + description="Scale for ControlNet conditioning.", + ) + @dataclass class OutputParam: @@ -497,13 +550,11 @@ def template(cls, name: str) -> Optional["OutputParam"]: @classmethod def images(cls) -> "OutputParam": - return cls(name="images", type_hint=List[PIL.Image.Image], - description="Generated images.") - + return cls(name="images", type_hint=List[PIL.Image.Image], description="Generated images.") + @classmethod def latents(cls) -> "OutputParam": - return cls(name="latents", type_hint=torch.Tensor, - description="Denoised latents.") + return cls(name="latents", type_hint=torch.Tensor, description="Denoised latents.") def format_inputs_short(inputs): diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index d61711e13a52..cb808b1d3807 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -228,7 +228,9 @@ def inputs(self) -> List[InputParam]: InputParam.latents(), InputParam.height(), InputParam.width(), - InputParam(name="layers", type_hint=int, default=4, description="Number of layers to extract from the image"), + InputParam( + name="layers", type_hint=int, default=4, description="Number of layers to extract from the image" + ), InputParam.num_images_per_prompt(), InputParam.generator(), InputParam( diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 9c3a1c01d018..8207e99b69ae 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Union +from typing import List -import numpy as np -import PIL import torch from ...configuration_utils import FrozenDict @@ -140,9 +138,7 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[str]: - return [ - OutputParam.images() - ] + return [OutputParam.images()] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -194,7 +190,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor, description="The latents to decode, can be generated in the denoise step"), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step", + ), InputParam.output_type(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 2eca8645ef2c..f0dd6471b168 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -301,8 +301,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="Input image for conditioning" + InputParam.template(self._image_input_name) + or InputParam( + name=self._image_input_name, + required=True, + type_hint=torch.Tensor, + description="Input image for conditioning", ), ] @@ -381,7 +385,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam( + InputParam.template(self._image_input_name) + or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), InputParam( @@ -484,7 +489,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam( + InputParam.template(self._image_input_name) + or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, @@ -564,7 +570,9 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", type_hint=str, description="The prompt to encode"), # it is not required for qwenimage-layered, unlike other pipelines + InputParam( + name="prompt", type_hint=str, description="The prompt to encode" + ), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -1081,7 +1089,12 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.mask_image(), - InputParam("resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step"), + InputParam( + "resized_image", + required=True, + type_hint=PIL.Image.Image, + description="The resized image. 
should be generated using a resize step", + ), InputParam.padding_mask_crop(), ] @@ -1311,7 +1324,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), InputParam.generator(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index e112578c399d..d6117a12a57d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks @@ -62,6 +58,7 @@ # 1. TEXT ENCODER # ==================== + class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] @@ -74,6 +71,7 @@ def description(self) -> str: " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided." " - if `prompt` is not provided, step will be skipped." + # ==================== # 2. VAE ENCODER # ==================== diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 30fcb842d591..14d0945dbe57 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional - -import PIL.Image -import torch +from typing import Optional from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 345b0cd93560..fbe5e60f353f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 965f9e1976ad..e91a5c40b19b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -13,11 +13,6 @@ # limitations under the License. 
-from typing import List - -import PIL.Image -import torch - from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam From ff09bf1a631e38683205217e8dba4961de090319 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 11:55:03 +0100 Subject: [PATCH 05/23] add modular_auto_docstring! --- .../qwenimage/modular_blocks_qwenimage.py | 814 +++++++++++++++++- utils/modular_auto_docstring.py | 296 +++++++ 2 files changed, 1104 insertions(+), 6 deletions(-) create mode 100644 utils/modular_auto_docstring.py diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index d6117a12a57d..19feffe77eda 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -58,8 +58,59 @@ # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): + """ + class QwenImageAutoTextEncoderStep + + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + + Components: + + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] block_names = ["text_encoder"] @@ -76,8 +127,54 @@ def description(self) -> str: # 2. VAE ENCODER # ==================== - +#auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintVaeEncoderStep + + This step is used for processing image and mask inputs for inpainting tasks. It: + - Resizes the image to the target size, based on `height` and `width`. + - Processes and updates `image` and `mask_image`. + - Creates `image_latents`. + + Components: + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + mask_image (`Image`): + Mask image for inpainting. + + image (`Image`): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. 
+ + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + processed_image (`None`): + + processed_mask_image (`None`): + + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @@ -92,7 +189,40 @@ def description(self) -> str: ) +#auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageImg2ImgVaeEncoderStep + + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage" block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -103,7 +233,6 @@ def description(self) -> str: return "Vae encoder step that preprocess andencode the image inputs into their latent representations." -# Auto VAE encoder class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] block_names = ["inpaint", "img2img"] @@ -121,7 +250,43 @@ def description(self): # optional controlnet vae encoder +#auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): + """ + class QwenImageOptionalControlNetVaeEncoderStep + + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. + - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. + - if `control_image` is not provided, step will be skipped. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + control_image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + control_image (`Image`, *optional*): + Control image for ControlNet conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + control_image_latents (`Tensor`): + The latents representing the control image + """ block_classes = [QwenImageControlNetVaeEncoderStep] block_names = ["controlnet"] block_trigger_inputs = ["control_image"] @@ -142,7 +307,52 @@ def description(self): # assemble input steps +#auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): + """ + class QwenImageImg2ImgInputStep + + Input step that prepares the inputs for the img2img denoising step. 
It: + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage" block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] block_names = ["text_inputs", "additional_inputs"] @@ -154,7 +364,54 @@ def description(self): " - update height/width based `image_latents`, patchify `image_latents`." +#auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintInputStep + + Input step that prepares the inputs for the inpainting denoising step. It: + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -172,7 +429,49 @@ def description(self): # assemble prepare latents steps +#auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintPrepareLatentsStep + + This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: + - Add noise to the image latents to create the latents input for the denoiser. + - Create the pachified latents `mask` based on the processedmask image. + + Components: + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + + image_latents (`Tensor`): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
+ + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + + height (`None`): + + width (`None`): + + dtype (`None`): + + Outputs: + + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + + mask (`Tensor`): + The mask to use for the inpainting process. + """ model_name = "qwenimage" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -190,7 +489,66 @@ def description(self) -> str: # Qwen Image (text2image) +#auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageCoreDenoiseStep + + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -212,10 +570,81 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (inpainting) +#auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. 
+ + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageInpaintInputStep(), @@ -240,9 +669,78 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (image2image) +#auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageImg2ImgCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageImg2ImgInputStep(), @@ -267,9 +765,87 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." 
+ @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (text2image) with controlnet +#auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageControlNetCoreDenoiseStep + + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + control_image_latents (`None`): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -295,10 +871,95 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (inpainting) with controlnet +#auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageControlNetInpaintCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + control_image_latents (`None`): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageInpaintInputStep(), @@ -327,9 +988,93 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] + # Qwen Image (image2image) with controlnet +#auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageControlNetImg2ImgCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + control_image_latents (`None`): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. 
+ + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageImg2ImgInputStep(), @@ -357,7 +1102,12 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Auto denoise step for QwenImage class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): @@ -426,7 +1176,32 @@ def outputs(self): # standard decode step works for most tasks except for inpaint +#auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -437,7 +1212,34 @@ def description(self): # Inpaint decode step +#auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py new file mode 100644 index 000000000000..c6aaf8a46a56 --- /dev/null +++ b/utils/modular_auto_docstring.py @@ -0,0 +1,296 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Auto Docstring Generator for Modular Pipeline Blocks + +This script scans Python files for classes that have `# auto_docstring` comment above them +and inserts/updates the docstring from the class's `doc` property. + +Run from the root of the repo: + python utils/modular_auto_docstring.py [path] [--fix_and_overwrite] + +Examples: + # Check for auto_docstring markers (will error if found without proper docstring) + python utils/modular_auto_docstring.py + + # Check specific directory + python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/ + + # Fix and overwrite the docstrings + python utils/modular_auto_docstring.py --fix_and_overwrite + +Usage in code: + # auto_docstring + class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): + # docstring will be automatically inserted here + + @property + def doc(self): + return "Your docstring content..." +""" + +import argparse +import ast +import glob +import importlib +import os +import re +import sys + + +# All paths are set with the intent you should run this script from the root of the repo +DIFFUSERS_PATH = "src/diffusers" +REPO_PATH = "." + +# Pattern to match the auto_docstring comment +AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$") + + +def setup_diffusers_import(): + """Setup import path to use the local diffusers module.""" + src_path = os.path.join(REPO_PATH, "src") + if src_path not in sys.path: + sys.path.insert(0, src_path) + + +def get_module_from_filepath(filepath: str) -> str: + """Convert a filepath to a module name.""" + filepath = os.path.normpath(filepath) + + if filepath.startswith("src" + os.sep): + filepath = filepath[4:] + + if filepath.endswith(".py"): + filepath = filepath[:-3] + + module_name = filepath.replace(os.sep, ".") + return module_name + + +def load_module(filepath: str): + """Load a module from filepath.""" + setup_diffusers_import() + module_name = get_module_from_filepath(filepath) + + try: + module = importlib.import_module(module_name) + return module + except Exception as e: + print(f"Warning: Could not import module {module_name}: {e}") + return None + + +def get_doc_from_class(module, class_name: str) -> str: + """Get the doc property from an instantiated class.""" + if module is None: + return None + + cls = getattr(module, class_name, None) + if cls is None: + return None + + try: + instance = cls() + if hasattr(instance, "doc"): + return instance.doc + except Exception as e: + print(f"Warning: Could not instantiate {class_name}: {e}") + + return None + + +def find_auto_docstring_classes(filepath: str) -> list: + """ + Find all classes in a file that have # auto_docstring comment above them. 
+ + Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) + """ + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Parse AST to find class locations and their docstrings + content = "".join(lines) + try: + tree = ast.parse(content) + except SyntaxError as e: + print(f"Syntax error in {filepath}: {e}") + return [] + + # Build a map of class_name -> (class_line, has_docstring, docstring_end_line) + class_info = {} + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + has_docstring = False + docstring_end_line = node.lineno # default to class line + + if node.body and isinstance(node.body[0], ast.Expr): + first_stmt = node.body[0] + if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str): + has_docstring = True + docstring_end_line = first_stmt.end_lineno or first_stmt.lineno + + class_info[node.name] = (node.lineno, has_docstring, docstring_end_line) + + # Now scan for # auto_docstring comments + classes_to_update = [] + + for i, line in enumerate(lines): + if AUTO_DOCSTRING_PATTERN.match(line): + # Found the marker, look for class definition on next non-empty, non-comment line + j = i + 1 + while j < len(lines): + next_line = lines[j].strip() + if next_line and not next_line.startswith("#"): + break + j += 1 + + if j < len(lines) and lines[j].strip().startswith("class "): + # Extract class name + match = re.match(r"class\s+(\w+)", lines[j].strip()) + if match: + class_name = match.group(1) + if class_name in class_info: + class_line, has_docstring, docstring_end_line = class_info[class_name] + classes_to_update.append(( + class_name, + class_line, + has_docstring, + docstring_end_line + )) + + return classes_to_update + + +def format_docstring(doc: str, indent: str = " ") -> str: + """Format a doc string as a properly indented docstring.""" + lines = doc.strip().split("\n") + + if len(lines) == 1: + return f'{indent}"""{lines[0]}"""\n' + else: + result = [f'{indent}"""\n'] + for line in lines: + if line.strip(): + result.append(f"{indent}{line}\n") + else: + result.append("\n") + result.append(f'{indent}"""\n') + return "".join(result) + + +def process_file(filepath: str, overwrite: bool = False) -> list: + """ + Process a file and find/insert docstrings for # auto_docstring marked classes. + + Returns list of classes that need updating. 
+ """ + classes_to_update = find_auto_docstring_classes(filepath) + + if not classes_to_update: + return [] + + if not overwrite: + # Just return the list of classes that need updating + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] + + # Load the module to get doc properties + module = load_module(filepath) + + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Process in reverse order to maintain line numbers + updated = False + for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update): + doc = get_doc_from_class(module, class_name) + + if doc is None: + print(f"Warning: Could not get doc for {class_name} in {filepath}") + continue + + # Format the new docstring with 4-space indent + new_docstring = format_docstring(doc, " ") + + if has_docstring: + # Replace existing docstring (line after class definition to docstring_end_line) + # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line + lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:] + else: + # Insert new docstring right after class definition line + # class_line is 1-indexed, so lines[class_line-1] is the class line + # Insert at position class_line (which is right after the class line) + lines = lines[:class_line] + [new_docstring] + lines[class_line:] + + updated = True + print(f"Updated docstring for {class_name} in {filepath}") + + if updated: + with open(filepath, "w", encoding="utf-8", newline="\n") as f: + f.writelines(lines) + + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] + + +def check_auto_docstrings(path: str = None, overwrite: bool = False): + """ + Check all files for # auto_docstring markers and optionally fix them. + """ + if path is None: + path = DIFFUSERS_PATH + + if os.path.isfile(path): + all_files = [path] + else: + all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True) + + all_markers = [] + + for filepath in all_files: + markers = process_file(filepath, overwrite) + all_markers.extend(markers) + + if not overwrite and len(all_markers) > 0: + message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers]) + raise ValueError( + f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n" + f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them." 
+ ) + + if overwrite and len(all_markers) > 0: + print(f"\nUpdated {len(all_markers)} docstring(s).") + elif len(all_markers) == 0: + print("No # auto_docstring markers found.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check and fix # auto_docstring markers in modular pipeline blocks", + ) + parser.add_argument( + "path", + nargs="?", + default=None, + help="File or directory to process (default: src/diffusers)" + ) + parser.add_argument( + "--fix_and_overwrite", + action="store_true", + help="Whether to fix the docstrings by inserting them from doc property.", + ) + + args = parser.parse_args() + + check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file From d20f413f78822e9513bd60c203bf0f58885b3a54 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:11:28 +0100 Subject: [PATCH 06/23] more auto docstring --- .../modular_blocks_qwenimage_edit.py | 471 +++++++++++++++++- .../modular_blocks_qwenimage_edit_plus.py | 226 ++++++++- .../modular_blocks_qwenimage_layered.py | 245 ++++++++- 3 files changed, 935 insertions(+), 7 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 14d0945dbe57..cae6236eb5aa 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -55,9 +55,62 @@ # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): - """VL encoder that takes both image and text prompts.""" + """ + class QwenImageEditVLEncoderStep + + QwenImage-Edit VL encoder step that encode the image and text prompts together. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + Outputs: + + resized_image (`List`): + The resized images + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage-edit" block_classes = [ @@ -77,7 +130,39 @@ def description(self) -> str: # Edit VAE encoder +#auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageEditVaeEncoderStep + + Vae encoder step that encode the image inputs into their latent representations. 
+ + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -92,7 +177,53 @@ def description(self) -> str: # Edit Inpaint VAE encoder +#auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintVaeEncoderStep + + This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: + - resize the image for target area (1024 * 1024) while maintaining the aspect ratio. + - process the resized image and mask image. + - create image latents. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + mask_image (`Image`): + Mask image for inpainting. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + processed_mask_image (`None`): + + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -134,7 +265,54 @@ def description(self): # assemble input steps +#auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): + """ + class QwenImageEditInputStep + + Input step that prepares the inputs for the edit denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. 
+ + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -151,7 +329,56 @@ def description(self): ) +#auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintInputStep + + Input step that prepares the inputs for the edit inpaint denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -171,7 +398,49 @@ def description(self): # assemble prepare latents steps +#auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintPrepareLatentsStep + + This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: + - Add noise to the image latents to create the latents input for the denoiser. + - Create the patchified latents `mask` based on the processed mask image. + + Components: + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + + image_latents (`Tensor`): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + + height (`None`): + + width (`None`): + + dtype (`None`): + + Outputs: + + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + + mask (`Tensor`): + The mask to use for the inpainting process. 
+ """ model_name = "qwenimage-edit" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -186,7 +455,68 @@ def description(self) -> str: # Qwen Image Edit (image2image) core denoise step +#auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageEditCoreDenoiseStep + + Core denoising workflow for QwenImage-Edit edit (img2img) task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditInputStep(), @@ -209,9 +539,81 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Core denoising workflow for QwenImage-Edit edit (img2img) task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] + # Qwen Image Edit (inpainting) core denoise step +#auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintCoreDenoiseStep + + Core denoising workflow for QwenImage-Edit edit inpaint task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. 
+ + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditInpaintInputStep(), @@ -236,6 +638,12 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Core denoising workflow for QwenImage-Edit edit inpaint task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] + # Auto core denoise step for QwenImage Edit class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): @@ -263,7 +671,12 @@ def description(self): " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." ) - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # ==================== # 4. DECODE @@ -271,7 +684,32 @@ def description(self): # Decode step (standard) +#auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageEditDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -282,7 +720,34 @@ def description(self): # Inpaint decode step +#auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index fbe5e60f353f..2fcd633f0d7f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -49,8 +49,64 @@ # ==================== +#auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): - """VL encoder that takes both image and text prompts. 
Uses 384x384 target area.""" + """ + class QwenImageEditPlusVLEncoderStep + + QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + Outputs: + + resized_cond_image (`List`): + The resized images + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage-edit-plus" block_classes = [ @@ -69,8 +125,40 @@ def description(self) -> str: # ==================== +#auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): - """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area.""" + """ + class QwenImageEditPlusVaeEncoderStep + + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-edit-plus" block_classes = [ @@ -94,7 +182,56 @@ def description(self) -> str: # assemble input steps +#auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): + """ + class QwenImageEditPlusInputStep + + Input step that prepares the inputs for the Edit Plus denoising step. It: + - Standardizes text embeddings batch size. + - Processes list of image latents: patchifies, concatenates along dim=1, expands batch. + - Outputs lists of image_height/image_width for RoPE calculation. + - Defaults height/width from last image in the list. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`List`): + The image heights calculated from the image latents dimension + + image_width (`List`): + The image widths calculated from the image latents dimension + """ model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), @@ -114,7 +251,67 @@ def description(self): # Qwen Image Edit Plus (image2image) core denoise step +#auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageEditPlusCoreDenoiseStep + + Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusInputStep(), @@ -149,7 +346,32 @@ def outputs(self): # ==================== +#auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageEditPlusDecodeStep + + Decode step that decodes the latents to images and postprocesses the generated image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. 
+ """ model_name = "qwenimage-edit-plus" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index e91a5c40b19b..f647f16868ab 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -49,9 +49,111 @@ # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): - """Text encoder that takes text prompt, will generate a prompt based on image if not provided.""" + """ + class QwenImageLayeredTextEncoderStep + + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + image_caption_prompt_en (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: + 1. Write the caption using natural, descriptive language without structured formats or rich text. + 2. Enrich caption details by including: + - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + 3. Maintain authenticity and accuracy: + - Avoid generalizations + - Describe all visible information in the image, while do not add information not explicitly shown in the image + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + image_caption_prompt_cn (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: + 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 + 2. 通过加入以下内容,丰富图注细节: + - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 + - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 + - 环境细节:例如天气、光照、颜色、纹理、气氛等 + - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 + 3. 保持真实性与准确性: + - 不要使用笼统的描述 + - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. 
+ + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + prompt (`str`, *optional*): + The prompt to encode + + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + + resized_image (`List`): + The resized images + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage-layered" block_classes = [ @@ -72,7 +174,42 @@ def description(self) -> str: # Edit VAE encoder +#auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageLayeredVaeEncoderStep + + Vae encoder step that encode the image inputs into their latent representations. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredResizeStep(), @@ -93,7 +230,54 @@ def description(self) -> str: # assemble input steps +#auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): + """ + class QwenImageLayeredInputStep + + Input step that prepares the inputs for the layered denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + + pachifier (`QwenImageLayeredPachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + + height (`int`): + The height of the image output + + width (`int`): + The width of the image output + """ model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), @@ -111,7 +295,64 @@ def description(self): # Qwen Image Layered (image2image) core denoise step +#auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageLayeredCoreDenoiseStep + + Core denoising workflow for QwenImage-Layered img2img task. + + Components: + + pachifier (`QwenImageLayeredPachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredInputStep(), From 2a81f2ec5417efdc7773937dd7db2f675a46b66a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:15:36 +0100 Subject: [PATCH 07/23] style --- .../qwenimage/modular_blocks_qwenimage.py | 86 ++++++++++++------- .../modular_blocks_qwenimage_edit.py | 46 ++++++---- .../modular_blocks_qwenimage_edit_plus.py | 26 +++--- .../modular_blocks_qwenimage_layered.py | 47 +++++----- 4 files changed, 116 insertions(+), 89 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 19feffe77eda..d54dca5f5ad6 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -58,7 +58,8 @@ # 1. 
TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): """ class QwenImageAutoTextEncoderStep @@ -76,11 +77,8 @@ class QwenImageAutoTextEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -111,6 +109,7 @@ class QwenImageAutoTextEncoderStep negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ + model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] block_names = ["text_encoder"] @@ -127,7 +126,8 @@ def description(self) -> str: # 2. VAE ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageInpaintVaeEncoderStep @@ -175,6 +175,7 @@ class QwenImageInpaintVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @@ -189,7 +190,7 @@ def description(self) -> str: ) -#auto_docstring +# auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgVaeEncoderStep @@ -223,6 +224,7 @@ class QwenImageImg2ImgVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage" block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -250,13 +252,12 @@ def description(self): # optional controlnet vae encoder -#auto_docstring +# auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. - This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -287,6 +288,7 @@ class QwenImageOptionalControlNetVaeEncoderStep control_image_latents (`Tensor`): The latents representing the control image """ + block_classes = [QwenImageControlNetVaeEncoderStep] block_names = ["controlnet"] block_trigger_inputs = ["control_image"] @@ -307,7 +309,7 @@ def description(self): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgInputStep @@ -353,6 +355,7 @@ class QwenImageImg2ImgInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage" block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] block_names = ["text_inputs", "additional_inputs"] @@ -364,7 +367,7 @@ def description(self): " - update height/width based `image_latents`, patchify `image_latents`." 
-#auto_docstring +# auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): """ class QwenImageInpaintInputStep @@ -412,6 +415,7 @@ class QwenImageInpaintInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -429,7 +433,7 @@ def description(self): # assemble prepare latents steps -#auto_docstring +# auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ class QwenImageInpaintPrepareLatentsStep @@ -450,7 +454,8 @@ class QwenImageInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -472,6 +477,7 @@ class QwenImageInpaintPrepareLatentsStep mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -489,12 +495,13 @@ def description(self) -> str: # Qwen Image (text2image) -#auto_docstring +# auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -570,20 +577,22 @@ class QwenImageCoreDenoiseStep @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Qwen Image (inpainting) -#auto_docstring +# auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -675,13 +684,15 @@ def outputs(self): OutputParam.latents(), ] + # Qwen Image (image2image) -#auto_docstring +# auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -771,13 +782,15 @@ def outputs(self): OutputParam.latents(), ] + # Qwen Image (text2image) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. 
It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -871,20 +884,22 @@ class QwenImageControlNetCoreDenoiseStep @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Qwen Image (inpainting) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -996,12 +1011,13 @@ def outputs(self): # Qwen Image (image2image) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -1102,13 +1118,14 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep @property def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Auto denoise step for QwenImage class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): block_classes = [ @@ -1176,7 +1193,7 @@ def outputs(self): # standard decode step works for most tasks except for inpaint -#auto_docstring +# auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): """ class QwenImageDecodeStep @@ -1202,6 +1219,7 @@ class QwenImageDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -1212,12 +1230,13 @@ def description(self): # Inpaint decode step -#auto_docstring +# auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1240,6 +1259,7 @@ class QwenImageInpaintDecodeStep images (`List`): Generated images. 
""" + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index cae6236eb5aa..37a438ea1f54 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -55,7 +55,8 @@ # 1. TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditVLEncoderStep @@ -75,11 +76,10 @@ class QwenImageEditVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -130,7 +130,7 @@ def description(self) -> str: # Edit VAE encoder -#auto_docstring +# auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditVaeEncoderStep @@ -163,6 +163,7 @@ class QwenImageEditVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -177,7 +178,7 @@ def description(self) -> str: # Edit Inpaint VAE encoder -#auto_docstring +# auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintVaeEncoderStep @@ -224,6 +225,7 @@ class QwenImageEditInpaintVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. 
""" + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -265,7 +267,7 @@ def description(self): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): """ class QwenImageEditInputStep @@ -313,6 +315,7 @@ class QwenImageEditInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -329,7 +332,7 @@ def description(self): ) -#auto_docstring +# auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintInputStep @@ -379,6 +382,7 @@ class QwenImageEditInpaintInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -398,7 +402,7 @@ def description(self): # assemble prepare latents steps -#auto_docstring +# auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintPrepareLatentsStep @@ -419,7 +423,8 @@ class QwenImageEditInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -441,6 +446,7 @@ class QwenImageEditInpaintPrepareLatentsStep mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage-edit" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -455,7 +461,7 @@ def description(self) -> str: # Qwen Image Edit (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditCoreDenoiseStep @@ -547,7 +553,7 @@ def outputs(self): # Qwen Image Edit (inpainting) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintCoreDenoiseStep @@ -671,20 +677,21 @@ def description(self): " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." ) - + @property def outputs(self): return [ OutputParam.latents(), ] + # ==================== # 4. DECODE # ==================== # Decode step (standard) -#auto_docstring +# auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditDecodeStep @@ -710,6 +717,7 @@ class QwenImageEditDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -720,12 +728,13 @@ def description(self): # Inpaint decode step -#auto_docstring +# auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. 
+ Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -748,6 +757,7 @@ class QwenImageEditInpaintDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 2fcd633f0d7f..851b69f232e7 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -49,7 +49,7 @@ # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVLEncoderStep @@ -69,11 +69,10 @@ class QwenImageEditPlusVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -125,13 +124,13 @@ def description(self) -> str: # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. - Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based + on its own aspect ratio to 1024x1024 target area. Components: @@ -182,7 +181,7 @@ def description(self) -> str: # assemble input steps -#auto_docstring +# auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): """ class QwenImageEditPlusInputStep @@ -232,6 +231,7 @@ class QwenImageEditPlusInputStep image_width (`List`): The image widths calculated from the image latents dimension """ + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), @@ -251,7 +251,7 @@ def description(self): # Qwen Image Edit Plus (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditPlusCoreDenoiseStep @@ -312,6 +312,7 @@ class QwenImageEditPlusCoreDenoiseStep latents (`Tensor`): Denoised latents. 
""" + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusInputStep(), @@ -346,7 +347,7 @@ def outputs(self): # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditPlusDecodeStep @@ -372,6 +373,7 @@ class QwenImageEditPlusDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage-edit-plus" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index f647f16868ab..56fa1345a5ce 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -49,12 +49,14 @@ # 1. TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -71,28 +73,23 @@ class QwenImageLayeredTextEncoderStep Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 
通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -102,16 +99,11 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -174,7 +166,7 @@ def description(self) -> str: # Edit VAE encoder -#auto_docstring +# auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredVaeEncoderStep @@ -210,6 +202,7 @@ class QwenImageLayeredVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredResizeStep(), @@ -230,7 +223,7 @@ def description(self) -> str: # assemble input steps -#auto_docstring +# auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): """ class QwenImageLayeredInputStep @@ -278,6 +271,7 @@ class QwenImageLayeredInputStep width (`int`): The width of the image output """ + model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), @@ -295,7 +289,7 @@ def description(self): # Qwen Image Layered (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageLayeredCoreDenoiseStep @@ -353,6 +347,7 @@ class QwenImageLayeredCoreDenoiseStep latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredInputStep(), From f0555af1c6be0adb75404f2724a071d8b49b5506 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:15:53 +0100 Subject: [PATCH 08/23] up up up --- utils/modular_auto_docstring.py | 90 +++++++++++++++------------------ 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index c6aaf8a46a56..e2d523b2f378 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -36,7 +36,7 @@ # auto_docstring class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): # docstring will be automatically inserted here - + @property def doc(self): return "Your docstring content..." 
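# ---------------------------------------------------------------------------
# Illustrative usage sketch for the marker-based workflow above (not part of
# the patch itself; the block name below is hypothetical, everything else
# follows the utility as defined in this file):
#
#     # auto_docstring
#     class MyQwenImageStep(SequentialPipelineBlocks):
#         # the generated docstring is inserted/refreshed directly below the
#         # class line by utils/modular_auto_docstring.py
#
#         @property
#         def doc(self):
#             return "Docstring text assembled from the block's inputs/outputs."
#
# Check a file or directory for markers (default path is src/diffusers):
#     python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/qwenimage
# Insert or refresh the docstrings in place from each marked class's `doc` property:
#     python utils/modular_auto_docstring.py --fix_and_overwrite
# ---------------------------------------------------------------------------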
@@ -69,13 +69,13 @@ def setup_diffusers_import(): def get_module_from_filepath(filepath: str) -> str: """Convert a filepath to a module name.""" filepath = os.path.normpath(filepath) - + if filepath.startswith("src" + os.sep): filepath = filepath[4:] - + if filepath.endswith(".py"): filepath = filepath[:-3] - + module_name = filepath.replace(os.sep, ".") return module_name @@ -84,7 +84,7 @@ def load_module(filepath: str): """Load a module from filepath.""" setup_diffusers_import() module_name = get_module_from_filepath(filepath) - + try: module = importlib.import_module(module_name) return module @@ -97,30 +97,30 @@ def get_doc_from_class(module, class_name: str) -> str: """Get the doc property from an instantiated class.""" if module is None: return None - + cls = getattr(module, class_name, None) if cls is None: return None - + try: instance = cls() if hasattr(instance, "doc"): return instance.doc except Exception as e: print(f"Warning: Could not instantiate {class_name}: {e}") - + return None def find_auto_docstring_classes(filepath: str) -> list: """ Find all classes in a file that have # auto_docstring comment above them. - + Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) """ with open(filepath, "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() - + # Parse AST to find class locations and their docstrings content = "".join(lines) try: @@ -128,25 +128,25 @@ def find_auto_docstring_classes(filepath: str) -> list: except SyntaxError as e: print(f"Syntax error in {filepath}: {e}") return [] - + # Build a map of class_name -> (class_line, has_docstring, docstring_end_line) class_info = {} for node in ast.walk(tree): if isinstance(node, ast.ClassDef): has_docstring = False docstring_end_line = node.lineno # default to class line - + if node.body and isinstance(node.body[0], ast.Expr): first_stmt = node.body[0] if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str): has_docstring = True docstring_end_line = first_stmt.end_lineno or first_stmt.lineno - + class_info[node.name] = (node.lineno, has_docstring, docstring_end_line) - + # Now scan for # auto_docstring comments classes_to_update = [] - + for i, line in enumerate(lines): if AUTO_DOCSTRING_PATTERN.match(line): # Found the marker, look for class definition on next non-empty, non-comment line @@ -156,7 +156,7 @@ def find_auto_docstring_classes(filepath: str) -> list: if next_line and not next_line.startswith("#"): break j += 1 - + if j < len(lines) and lines[j].strip().startswith("class "): # Extract class name match = re.match(r"class\s+(\w+)", lines[j].strip()) @@ -164,20 +164,15 @@ def find_auto_docstring_classes(filepath: str) -> list: class_name = match.group(1) if class_name in class_info: class_line, has_docstring, docstring_end_line = class_info[class_name] - classes_to_update.append(( - class_name, - class_line, - has_docstring, - docstring_end_line - )) - + classes_to_update.append((class_name, class_line, has_docstring, docstring_end_line)) + return classes_to_update def format_docstring(doc: str, indent: str = " ") -> str: """Format a doc string as a properly indented docstring.""" lines = doc.strip().split("\n") - + if len(lines) == 1: return f'{indent}"""{lines[0]}"""\n' else: @@ -194,36 +189,36 @@ def format_docstring(doc: str, indent: str = " ") -> str: def process_file(filepath: str, overwrite: bool = False) -> list: """ Process a file and find/insert docstrings for # auto_docstring marked classes. 
- + Returns list of classes that need updating. """ classes_to_update = find_auto_docstring_classes(filepath) - + if not classes_to_update: return [] - + if not overwrite: # Just return the list of classes that need updating return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] - + # Load the module to get doc properties module = load_module(filepath) - + with open(filepath, "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() - + # Process in reverse order to maintain line numbers updated = False for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update): doc = get_doc_from_class(module, class_name) - + if doc is None: print(f"Warning: Could not get doc for {class_name} in {filepath}") continue - + # Format the new docstring with 4-space indent new_docstring = format_docstring(doc, " ") - + if has_docstring: # Replace existing docstring (line after class definition to docstring_end_line) # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line @@ -233,14 +228,14 @@ def process_file(filepath: str, overwrite: bool = False) -> list: # class_line is 1-indexed, so lines[class_line-1] is the class line # Insert at position class_line (which is right after the class line) lines = lines[:class_line] + [new_docstring] + lines[class_line:] - + updated = True print(f"Updated docstring for {class_name} in {filepath}") - + if updated: with open(filepath, "w", encoding="utf-8", newline="\n") as f: f.writelines(lines) - + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] @@ -250,25 +245,25 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): """ if path is None: path = DIFFUSERS_PATH - + if os.path.isfile(path): all_files = [path] else: all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True) - + all_markers = [] - + for filepath in all_files: markers = process_file(filepath, overwrite) all_markers.extend(markers) - + if not overwrite and len(all_markers) > 0: message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers]) raise ValueError( f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n" f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them." 
) - + if overwrite and len(all_markers) > 0: print(f"\nUpdated {len(all_markers)} docstring(s).") elif len(all_markers) == 0: @@ -279,18 +274,13 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): parser = argparse.ArgumentParser( description="Check and fix # auto_docstring markers in modular pipeline blocks", ) - parser.add_argument( - "path", - nargs="?", - default=None, - help="File or directory to process (default: src/diffusers)" - ) + parser.add_argument("path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)") parser.add_argument( "--fix_and_overwrite", action="store_true", help="Whether to fix the docstrings by inserting them from doc property.", ) - + args = parser.parse_args() - - check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file + + check_auto_docstrings(args.path, args.fix_and_overwrite) From 507953f4156349d4d96cc6a8e0e7aa8eeefcf47e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:19:14 +0100 Subject: [PATCH 09/23] more more --- .../qwenimage/modular_blocks_qwenimage.py | 168 +++++++++++++++--- .../modular_blocks_qwenimage_edit.py | 118 +++++++++++- .../modular_blocks_qwenimage_edit_plus.py | 102 ++++++++++- .../modular_blocks_qwenimage_layered.py | 165 +++++++++++++++-- 4 files changed, 503 insertions(+), 50 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index d54dca5f5ad6..7f18de4f99dd 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -77,8 +77,11 @@ class QwenImageAutoTextEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -257,7 +260,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -454,8 +458,7 @@ class QwenImageInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -500,8 +503,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. 
It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -591,8 +593,7 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -691,8 +692,7 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -789,8 +789,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -898,8 +897,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -1016,8 +1014,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -1235,8 +1232,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: @@ -1298,8 +1294,140 @@ def description(self): ] ) - +# auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageAutoBlocks + + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. 
+ - for image-to-image generation, you need to provide `image` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - to run the controlnet workflow, you need to provide `control_image` + - for text-to-image generation, all you need to provide is `prompt` + + Components: + + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + control_image_processor (`VaeImageProcessor`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + mask_image (`Image`, *optional*): + Mask image for inpainting. + + image (`Image`, *optional*): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + control_image (`Image`, *optional*): + Control image for ControlNet conditioning. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + latents (`Tensor`): + Pre-generated noisy latents for image generation. + + num_inference_steps (`int`): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + control_image_latents (`None`, *optional*): + + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. 
+ + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage" block_classes = AUTO_BLOCKS.values() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 37a438ea1f54..91efe9dda2bf 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -76,10 +76,11 @@ class QwenImageEditVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) @@ -423,8 +424,7 @@ class QwenImageEditInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -733,8 +733,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: @@ -802,8 +801,109 @@ def outputs(self): ] ) - +# auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageEditAutoBlocks + + Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
+ - for edit (img2img) generation, you need to provide `image` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + mask_image (`Image`, *optional*): + Mask image for inpainting. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + height (`int`): + The height in pixels of the generated image. + + width (`int`): + The width in pixels of the generated image. + + image_latents (`None`): + + processed_mask_image (`None`, *optional*): + + latents (`Tensor`): + Pre-generated noisy latents for image generation. + + num_inference_steps (`int`): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. 
+ """ model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 851b69f232e7..3a780daf9602 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -69,10 +69,11 @@ class QwenImageEditPlusVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -129,8 +130,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based - on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. Components: @@ -396,8 +397,95 @@ def description(self): ] ) - +# auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageEditPlusAutoBlocks + + Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. + - `image` is required input (can be single image or list of images). + - Each image is resized independently based on its own aspect ratio. + - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage-edit-plus" block_classes = EDIT_PLUS_AUTO_BLOCKS.values() block_names = EDIT_PLUS_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 56fa1345a5ce..7cb5cd7a1ca3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -55,8 +55,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: @@ -73,23 +72,28 @@ class QwenImageLayeredTextEncoderStep Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -99,11 +103,16 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -390,8 +399,136 @@ def outputs(self): ] ) - +# auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageLayeredAutoBlocks + + Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + pachifier (`QwenImageLayeredPachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + image_caption_prompt_en (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: + 1. 
Write the caption using natural, descriptive language without structured formats or rich text. + 2. Enrich caption details by including: + - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + 3. Maintain authenticity and accuracy: + - Avoid generalizations + - Describe all visible information in the image, while do not add information not explicitly shown in the image + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + image_caption_prompt_cn (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: + 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 + 2. 通过加入以下内容,丰富图注细节: + - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 + - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 + - 环境细节:例如天气、光照、颜色、纹理、气氛等 + - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 + 3. 保持真实性与准确性: + - 不要使用笼统的描述 + - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + prompt (`str`, *optional*): + The prompt to encode + + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. 
+ """ model_name = "qwenimage-layered" block_classes = LAYERED_AUTO_BLOCKS.values() block_names = LAYERED_AUTO_BLOCKS.keys() From 1c90ce33f2445b29c1967976a1734db97f5eaa3a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:21:26 +0100 Subject: [PATCH 10/23] up --- .../qwenimage/modular_blocks_qwenimage.py | 47 +++++++------ .../modular_blocks_qwenimage_edit.py | 29 ++++---- .../modular_blocks_qwenimage_edit_plus.py | 24 +++---- .../modular_blocks_qwenimage_layered.py | 69 +++++++------------ 4 files changed, 79 insertions(+), 90 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 7f18de4f99dd..85b77c2a6b93 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -77,11 +77,8 @@ class QwenImageAutoTextEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -260,8 +257,7 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. - This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -458,7 +454,8 @@ class QwenImageInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -503,7 +500,8 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -593,7 +591,8 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -692,7 +691,8 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) 
for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -789,7 +789,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -897,7 +898,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -1014,7 +1016,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -1232,7 +1235,8 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1294,6 +1298,7 @@ def description(self): ] ) + # auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): """ @@ -1301,7 +1306,7 @@ class QwenImageAutoBlocks Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` - to run the controlnet workflow, you need to provide `control_image` - for text-to-image generation, all you need to provide is `prompt` @@ -1332,11 +1337,8 @@ class QwenImageAutoBlocks Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -1428,6 +1430,7 @@ class QwenImageAutoBlocks images (`List`): Generated images. 
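    Example: a minimal usage sketch, assuming the standard Modular Diffusers loading workflow; the import path, the
    helpers `init_pipeline` / `load_default_components`, the `output="images"` call convention, and the repo id
    "Qwen/Qwen-Image" are assumptions rather than values defined in this module.

        import torch
        from diffusers.modular_pipelines.qwenimage import QwenImageAutoBlocks  # import path assumed

        # Build the block graph and bind it to a pretrained repository (repo id assumed).
        blocks = QwenImageAutoBlocks()
        pipe = blocks.init_pipeline("Qwen/Qwen-Image")
        pipe.load_default_components(torch_dtype=torch.bfloat16)
        pipe.to("cuda")

        # Text-to-image only needs `prompt`; passing `image`, `mask_image`, or
        # `control_image` switches the auto blocks into the img2img, inpainting,
        # or controlnet workflow respectively.
        images = pipe(prompt="a cat wearing a tiny hat", num_inference_steps=50, output="images")
        images[0].save("qwenimage_t2i.png")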
""" + model_name = "qwenimage" block_classes = AUTO_BLOCKS.values() @@ -1438,7 +1441,7 @@ def description(self): return ( "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n" + "- to run the controlnet workflow, you need to provide `control_image`\n" + "- for text-to-image generation, all you need to provide is `prompt`" ) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 91efe9dda2bf..3fcbc8853f48 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -76,11 +76,10 @@ class QwenImageEditVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -424,7 +423,8 @@ class QwenImageEditInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -733,7 +733,8 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -801,6 +802,7 @@ def outputs(self): ] ) + # auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ @@ -808,7 +810,8 @@ class QwenImageEditAutoBlocks Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: @@ -835,11 +838,10 @@ class QwenImageEditAutoBlocks Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -904,6 +906,7 @@ class QwenImageEditAutoBlocks images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 3a780daf9602..0364e394d29d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -69,11 +69,10 @@ class QwenImageEditPlusVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -130,8 +129,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. - Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based + on its own aspect ratio to 1024x1024 target area. 
Components: @@ -397,6 +396,7 @@ def description(self): ] ) + # auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): """ @@ -430,11 +430,10 @@ class QwenImageEditPlusAutoBlocks Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -486,6 +485,7 @@ class QwenImageEditPlusAutoBlocks images (`List`): Generated images. """ + model_name = "qwenimage-edit-plus" block_classes = EDIT_PLUS_AUTO_BLOCKS.values() block_names = EDIT_PLUS_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7cb5cd7a1ca3..5602fc9b93e5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -55,7 +55,8 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -72,28 +73,23 @@ class QwenImageLayeredTextEncoderStep Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -103,16 +99,11 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -399,6 +390,7 @@ def outputs(self): ] ) + # auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): """ @@ -431,28 +423,23 @@ class QwenImageLayeredAutoBlocks Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -462,16 +449,11 @@ class QwenImageLayeredAutoBlocks 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -529,6 +511,7 @@ class QwenImageLayeredAutoBlocks images (`List`): Generated images. 
""" + model_name = "qwenimage-layered" block_classes = LAYERED_AUTO_BLOCKS.values() block_names = LAYERED_AUTO_BLOCKS.keys() From aea0d046f6eb759dca55a11bd9c55f89db39b3e4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 17 Jan 2026 09:36:58 +0100 Subject: [PATCH 11/23] address feedbacks --- .../modular_pipeline_utils.py | 4 +- .../qwenimage/modular_blocks_qwenimage.py | 408 ++++-------------- .../modular_blocks_qwenimage_edit.py | 256 +++-------- .../modular_blocks_qwenimage_edit_plus.py | 147 ++----- .../modular_blocks_qwenimage_layered.py | 190 +++----- utils/modular_auto_docstring.py | 16 +- 6 files changed, 271 insertions(+), 750 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index fab7c7193e5d..368fbbcbd138 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -711,7 +711,7 @@ def wrap_text(text, indent, max_length): formatted_params.append(param_str) - return "\n\n".join(formatted_params) + return "\n".join(formatted_params) def format_input_params(input_params, indent_level=4, max_line_length=115): @@ -781,7 +781,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty loading_field_values = [] for field_name in component.loading_fields(): field_value = getattr(component, field_name) - if field_value is not None: + if field_value: loading_field_values.append(f"{field_name}={field_value}") # Add loading field information if available diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 85b77c2a6b93..3bd4ae56832a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -62,50 +62,44 @@ # auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): """ - class QwenImageAutoTextEncoderStep - - Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. 
Outputs: - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -130,48 +124,36 @@ def description(self) -> str: # auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for inpainting tasks. It: + This step is used for processing image and mask inputs for inpainting tasks. It: - Resizes the image to the target size, based on `height` and `width`. - Processes and updates `image` and `mask_image`. - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - mask_image (`Image`): Mask image for inpainting. - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -193,34 +175,26 @@ def description(self) -> str: # auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgVaeEncoderStep - - Vae encoder step that preprocess andencode the image inputs into their latent representations. + Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -255,36 +229,30 @@ def description(self): # auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ - class QwenImageOptionalControlNetVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. 
Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) Inputs: - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - control_image_latents (`Tensor`): The latents representing the control image """ @@ -312,46 +280,32 @@ def description(self): # auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgInputStep - - Input step that prepares the inputs for the img2img denoising step. It: + Input step that prepares the inputs for the img2img denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -370,48 +324,33 @@ def description(self): # auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageInpaintInputStep - - Input step that prepares the inputs for the inpainting denoising step. It: + Input step that prepares the inputs for the inpainting denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -436,44 +375,32 @@ def description(self): # auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. 
It: + This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. """ @@ -498,60 +425,43 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -589,67 +499,47 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. 
Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -689,65 +579,46 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. 
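    Example: a conceptual sketch of how `strength` typically truncates the timestep schedule for img2img-style
    denoising; it illustrates the general technique only and is not the block's actual implementation.

        def truncate_timesteps_for_img2img(timesteps, num_inference_steps, strength):
            # Keep only the final `strength` fraction of the schedule, so a lower
            # strength starts denoising closer to the input image latents.
            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
            t_start = max(num_inference_steps - init_timestep, 0)
            return timesteps[t_start:], num_inference_steps - t_start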
""" @@ -787,74 +658,53 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - control_image_latents (`None`): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -896,81 +746,57 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
- prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1014,79 +840,56 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. 
- controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1196,26 +999,21 @@ def outputs(self): # auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): """ - class QwenImageDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -1233,29 +1031,22 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ @@ -1302,131 +1093,102 @@ def description(self): # auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageAutoBlocks - - Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`. 
- to run the controlnet workflow, you need to provide `control_image` - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Image`, *optional*): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. 
- control_image_latents (`None`, *optional*): - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 3fcbc8853f48..627cfce6ee7b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -59,55 +59,46 @@ # auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVLEncoderStep - - QwenImage-Edit VL encoder step that encode the image and text prompts together. + QwenImage-Edit VL encoder step that encode the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -133,33 +124,26 @@ def description(self) -> str: # auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. + Vae encoder step that encode the image inputs into their latent representations. 
Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -181,47 +165,36 @@ def description(self) -> str: # auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: + This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: - resize the image for target area (1024 * 1024) while maintaining the aspect ratio. - process the resized image and mask image. - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - mask_image (`Image`): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -270,48 +243,34 @@ def description(self): # auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInputStep - - Input step that prepares the inputs for the edit denoising step. It: + Input step that prepares the inputs for the edit denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. 
- image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -335,50 +294,35 @@ def description(self): # auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintInputStep - - Input step that prepares the inputs for the edit inpaint denoising step. It: + Input step that prepares the inputs for the edit inpaint denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -405,44 +349,32 @@ def description(self): # auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: + This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the patchified latents `mask` based on the processed mask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. 
""" @@ -464,61 +396,44 @@ def description(self) -> str: # auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit (img2img) task. + Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -556,66 +471,47 @@ def outputs(self): # auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit inpaint task. + Core denoising workflow for QwenImage-Edit edit inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. 
- **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -694,26 +590,21 @@ def outputs(self): # auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -731,29 +622,22 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ @@ -806,103 +690,81 @@ def outputs(self): # auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditAutoBlocks - - Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. + Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide - `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - mask_image (`Image`, *optional*): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`): The height in pixels of the generated image. - width (`int`): The width in pixels of the generated image. - image_latents (`None`): - processed_mask_image (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
- output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 0364e394d29d..cc07fc1e6a75 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -52,57 +52,48 @@ # auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVLEncoderStep - - QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. + QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_cond_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -127,34 +118,27 @@ def description(self) -> str: # auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVaeEncoderStep - - VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based - on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -184,9 +168,7 @@ def description(self) -> str: # auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusInputStep - - Input step that prepares the inputs for the Edit Plus denoising step. It: + Input step that prepares the inputs for the Edit Plus denoising step. It: - Standardizes text embeddings batch size. - Processes list of image latents: patchifies, concatenates along dim=1, expands batch. - Outputs lists of image_height/image_width for RoPE calculation. @@ -194,40 +176,28 @@ class QwenImageEditPlusInputStep Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`List`): The image heights calculated from the image latents dimension - image_width (`List`): The image widths calculated from the image latents dimension """ @@ -254,61 +224,44 @@ def description(self): # auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. + Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. 
- num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -350,26 +303,21 @@ def outputs(self): # auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusDecodeStep - - Decode step that decodes the latents to images and postprocesses the generated image. + Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -400,88 +348,73 @@ def description(self): # auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditPlusAutoBlocks - - Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. + Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. - `image` is required input (can be single image or list of images). - Each image is resized independently based on its own aspect ratio. - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 5602fc9b93e5..7cbc174871b5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,43 +53,45 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredTextEncoderStep - - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -99,50 +101,44 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -169,36 +165,28 @@ def description(self) -> str: # auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. 
+ Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -226,48 +214,34 @@ def description(self) -> str: # auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): """ - class QwenImageLayeredInputStep - - Input step that prepares the inputs for the layered denoising step. It: + Input step that prepares the inputs for the layered denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension - height (`int`): The height of the image output - width (`int`): The width of the image output """ @@ -292,58 +266,42 @@ def description(self): # auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageLayeredCoreDenoiseStep - - Core denoising workflow for QwenImage-Layered img2img task. + Core denoising workflow for QwenImage-Layered img2img task. Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - generator (`Generator`, *optional*): Torch generator for deterministic generation. 
- num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -394,52 +352,55 @@ def outputs(self): # auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageLayeredAutoBlocks - - Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. + Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. 
Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -449,65 +410,54 @@ class QwenImageLayeredAutoBlocks 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. 
""" diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index e2d523b2f378..01d984a58430 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -169,6 +169,17 @@ def find_auto_docstring_classes(filepath: str) -> list: return classes_to_update +def strip_class_name_line(doc: str, class_name: str) -> str: + """Remove the 'class ClassName' line from the doc if present.""" + lines = doc.strip().split("\n") + if lines and lines[0].strip() == f"class {class_name}": + # Remove the class line and any blank line following it + lines = lines[1:] + while lines and not lines[0].strip(): + lines = lines[1:] + return "\n".join(lines) + + def format_docstring(doc: str, indent: str = " ") -> str: """Format a doc string as a properly indented docstring.""" lines = doc.strip().split("\n") @@ -216,6 +227,9 @@ def process_file(filepath: str, overwrite: bool = False) -> list: print(f"Warning: Could not get doc for {class_name} in {filepath}") continue + # Remove the "class ClassName" line since it's redundant in a docstring + doc = strip_class_name_line(doc, class_name) + # Format the new docstring with 4-space indent new_docstring = format_docstring(doc, " ") @@ -283,4 +297,4 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): args = parser.parse_args() - check_auto_docstrings(args.path, args.fix_and_overwrite) + check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file From 25c968a38f991b020d12604eedb4efda1d016dee Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 17 Jan 2026 09:57:56 +0100 Subject: [PATCH 12/23] add TODO in the description for empty docstring --- .../modular_pipeline_utils.py | 2 + .../modular_pipelines/qwenimage/encoders.py | 3 +- .../qwenimage/modular_blocks_qwenimage.py | 97 +++++++++++++++---- .../modular_blocks_qwenimage_edit.py | 59 ++++++++--- .../modular_blocks_qwenimage_edit_plus.py | 29 ++++-- .../modular_blocks_qwenimage_layered.py | 78 +++++++-------- utils/modular_auto_docstring.py | 2 +- 7 files changed, 184 insertions(+), 86 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 368fbbcbd138..45556c538ab8 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -708,6 +708,8 @@ def wrap_text(text, indent, max_length): desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description) wrapped_desc = wrap_text(desc, desc_indent, max_line_length) param_str += f"\n{desc_indent}{wrapped_desc}" + else: + param_str += f"\n{desc_indent}TODO: Add description." 
formatted_params.append(param_str) diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index f0dd6471b168..8d7b1905423d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -1324,7 +1324,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.template(self._image_input_name) + or InputParam(name=self._image_input_name, required=True, description="The image tensor to encode"), InputParam.generator(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 3bd4ae56832a..645c01f66ee5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -75,11 +75,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -151,7 +148,9 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Outputs: processed_image (`None`): + TODO: Add description. processed_mask_image (`None`): + TODO: Add description. mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): @@ -195,6 +194,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Outputs: processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -290,14 +290,19 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -334,15 +339,21 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. 
image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -389,14 +400,18 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. height (`None`): + TODO: Add description. width (`None`): + TODO: Add description. dtype (`None`): + TODO: Add description. Outputs: initial_noise (`Tensor`): @@ -425,7 +440,8 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: @@ -441,9 +457,13 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. height (`int`, *optional*): @@ -499,7 +519,8 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: @@ -515,15 +536,21 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -579,7 +606,8 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. 
Components: @@ -595,14 +623,19 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -658,7 +691,8 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: @@ -676,10 +710,15 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -746,7 +785,8 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: @@ -764,16 +804,23 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -840,7 +887,8 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. 
Components: @@ -858,15 +906,21 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -1031,7 +1085,8 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1045,6 +1100,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): @@ -1126,11 +1182,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -1160,9 +1213,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -1174,10 +1231,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. control_image_latents (`None`, *optional*): + TODO: Add description. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): @@ -1187,6 +1247,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. 
Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 627cfce6ee7b..0bfbb921c9c4 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -74,11 +74,10 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -144,6 +143,7 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -192,7 +192,9 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. processed_mask_image (`None`): + TODO: Add description. mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): @@ -255,14 +257,19 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -306,15 +313,21 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. 
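Entries such as `prompt_embeds` and `image_latents` above are declared without descriptions at this point in the series, which is exactly what the later `InputParam.template(...)` refactor (see the modular_pipeline_utils.py changes further down) addresses: a template is shorthand for the fully spelled-out parameter. A minimal sketch of that equivalence, assuming the later patches in this series are applied, using the `num_images_per_prompt` entry from INPUT_PARAM_TEMPLATES:

# Sketch: the two declarations below should produce equivalent specs once the
# template refactor introduced later in this patch series is in place.
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

explicit = InputParam(
    name="num_images_per_prompt",
    type_hint=int,
    default=1,
    description="The number of images to generate per prompt.",
)
templated = InputParam.template("num_images_per_prompt")

assert templated.name == explicit.name
assert templated.default == explicit.default == 1
assert templated.description == explicit.description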
Outputs: batch_size (`int`): @@ -363,14 +376,18 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. height (`None`): + TODO: Add description. width (`None`): + TODO: Add description. dtype (`None`): + TODO: Add description. Outputs: initial_noise (`Tensor`): @@ -412,14 +429,19 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -487,15 +509,21 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -622,7 +650,8 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -636,6 +665,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): @@ -692,7 +722,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: @@ -719,11 +750,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -747,7 +777,9 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): width (`int`): The width in pixels of the generated image. image_latents (`None`): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -763,6 +795,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index cc07fc1e6a75..8dab6fbcf95d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -67,11 +67,10 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -139,6 +138,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. 
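Once the `note=` parameter lands (PATCH 14 below), blocks can reuse a shared description and append step-specific context instead of leaving gaps like the ones above. A sketch of that behaviour, assuming the template tables and validation introduced later in this series:

# Sketch of InputParam.template semantics after PATCH 14 in this series.
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

# Base template plus a step-specific note appended to the description.
height = InputParam.template("height", required=True, note="should be updated in prepare latents step.")
assert height.required is True
assert height.description.endswith("(should be updated in prepare latents step.)")

# Keyword overrides replace template fields outright.
steps = InputParam.template("num_inference_steps", default=28)
assert steps.default == 28

# Unknown names fail loudly instead of silently returning a bare param.
try:
    InputParam.template("not_a_real_template")
except ValueError as err:
    print(err)  # "InputParam template for not_a_real_template not found"

# required + default is rejected at construction time, per the __post_init__
# guard added in the same patch.
try:
    InputParam(name="prompt", required=True, default="a cat")
except ValueError as err:
    print(err)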
image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -182,14 +182,19 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -240,14 +245,19 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -376,11 +386,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7cbc174871b5..544b1abfc3ed 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,7 +53,8 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -70,28 +71,23 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. 
Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -101,16 +97,11 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -187,6 +178,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -226,10 +218,15 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. image_latents (`None`, *optional*): + TODO: Add description. 
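The `prompt_embeds` / `image_latents` intermediates listed above are produced by earlier encoder steps; under the template refactor later in the series, those producer blocks can declare their outputs from OUTPUT_PARAM_TEMPLATES, which also carry the `kwargs_type="denoiser_input_fields"` tag that lets downstream denoise steps collect them through the `**denoiser_input_fields` catch-all input. A minimal sketch, assuming the tables from PATCH 14:

# Sketch: how a text-encoder style block could declare its intermediate outputs
# from OUTPUT_PARAM_TEMPLATES (names and fields as introduced later in this series).
from diffusers.modular_pipelines.modular_pipeline_utils import OutputParam

intermediate_outputs = [
    OutputParam.template("prompt_embeds"),
    OutputParam.template("prompt_embeds_mask"),
    OutputParam.template("negative_prompt_embeds"),
    OutputParam.template("negative_prompt_embeds_mask"),
]

# Each template is tagged so later steps can group these conditional inputs.
assert all(p.kwargs_type == "denoiser_input_fields" for p in intermediate_outputs)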
Outputs: batch_size (`int`): @@ -282,10 +279,15 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. layers (`int`, *optional*, defaults to 4): @@ -379,28 +381,23 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -410,16 +407,11 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 
保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index 01d984a58430..7bb2c87e81da 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -297,4 +297,4 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): args = parser.parse_args() - check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file + check_auto_docstrings(args.path, args.fix_and_overwrite) From de03d7f1005777cc3bfdf9107bb8b775311fce8d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 18 Jan 2026 00:35:01 +0100 Subject: [PATCH 13/23] refactor based on dhruv's feedback: remove the class method --- .../modular_pipeline_utils.py | 343 ++++++++---------- 1 file changed, 147 insertions(+), 196 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 45556c538ab8..f8dde1fbd096 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -324,6 +324,133 @@ class ConfigSpec: description: Optional[str] = None +# ====================================================== +# InputParam and OutputParam templates +# ====================================================== + +INPUT_PARAM_TEMPLATES = { + "prompt": { + "type_hint": str, + "required": True, + "description": "The prompt or prompts to guide image generation.", + }, + "negative_prompt": { + "type_hint": str, + "default": None, + "description": "The prompt or prompts not to guide the image generation.", + }, + "max_sequence_length": { + "type_hint": int, + "default": 512, + "description": "Maximum sequence length for prompt encoding.", + }, + "height": { + "type_hint": int, + "description": "The height in pixels of the generated image.", + }, + "width": { + "type_hint": int, + "description": "The width in pixels of the generated image.", + }, + "num_inference_steps": { + "type_hint": int, + "default": 50, + "description": "The number of denoising steps.", + }, + "num_images_per_prompt": { + "type_hint": int, + "default": 1, + "description": "The number of images to generate per prompt.", + }, + "generator": { + "type_hint": torch.Generator, + "default": None, + "description": "Torch generator for deterministic generation.", + }, + "sigmas": { + "type_hint": List[float], + "default": None, + "description": "Custom sigmas for the denoising process.", + }, + "strength": { + "type_hint": float, + "default": 0.9, + "description": "Strength for img2img/inpainting.", + }, + "image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Input image for img2img, editing, or conditioning.", + }, + "mask_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Mask image for 
inpainting.", + }, + "control_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Control image for ControlNet conditioning.", + }, + "padding_mask_crop": { + "type_hint": int, + "default": None, + "description": "Padding for mask cropping in inpainting.", + }, + "latents": { + "type_hint": torch.Tensor, + "default": None, + "description": "Pre-generated noisy latents for image generation.", + }, + "timesteps": { + "type_hint": torch.Tensor, + "default": None, + "description": "Timesteps for the denoising process.", + }, + "output_type": { + "type_hint": str, + "default": "pil", + "description": "Output format: 'pil', 'np', 'pt'.", + }, + "attention_kwargs": { + "type_hint": Dict[str, Any], + "default": None, + "description": "Additional kwargs for attention processors.", + }, + "denoiser_input_fields": { + "kwargs_type": "denoiser_input_fields", + "type_hint": torch.Tensor, + "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + }, + "control_guidance_start": { + "type_hint": float, + "default": 0.0, + "description": "When to start applying ControlNet.", + }, + "control_guidance_end": { + "type_hint": float, + "default": 1.0, + "description": "When to stop applying ControlNet.", + }, + "controlnet_conditioning_scale": { + "type_hint": float, + "default": 1.0, + "description": "Scale for ControlNet conditioning.", + }, +} + +OUTPUT_PARAM_TEMPLATES = { + "images": { + "type_hint": List[PIL.Image.Image], + "description": "Generated images.", + }, + "latents": { + "type_hint": torch.Tensor, + "description": "Denoised latents.", + }, +} + + # YiYi Notes: both inputs and intermediate_inputs are InputParam objects # however some fields are not relevant for intermediate_inputs # e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed @@ -344,190 +471,22 @@ def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @classmethod - def template(cls, name: str) -> Optional["InputParam"]: - """Get template for name if exists, otherwise None.""" - if hasattr(cls, name) and callable(getattr(cls, name)): - return getattr(cls, name)() - return None - - # ====================================================== - # InputParam templates - # ====================================================== - - @classmethod - def prompt(cls) -> "InputParam": - return cls( - name="prompt", type_hint=str, required=True, description="The prompt or prompts to guide image generation." - ) - - @classmethod - def negative_prompt(cls) -> "InputParam": - return cls( - name="negative_prompt", - type_hint=str, - default=None, - description="The prompt or prompts not to guide the image generation.", - ) - - @classmethod - def max_sequence_length(cls, default: int = 512) -> "InputParam": - return cls( - name="max_sequence_length", - type_hint=int, - default=default, - description="Maximum sequence length for prompt encoding.", - ) - - @classmethod - def height(cls, default: Optional[int] = None) -> "InputParam": - return cls( - name="height", type_hint=int, default=default, description="The height in pixels of the generated image." - ) - - @classmethod - def width(cls, default: Optional[int] = None) -> "InputParam": - return cls( - name="width", type_hint=int, default=default, description="The width in pixels of the generated image." 
- ) - - @classmethod - def num_inference_steps(cls, default: int = 50) -> "InputParam": - return cls( - name="num_inference_steps", type_hint=int, default=default, description="The number of denoising steps." - ) - - @classmethod - def num_images_per_prompt(cls, default: int = 1) -> "InputParam": - return cls( - name="num_images_per_prompt", - type_hint=int, - default=default, - description="The number of images to generate per prompt.", - ) - - @classmethod - def generator(cls) -> "InputParam": - return cls( - name="generator", - type_hint=torch.Generator, - default=None, - description="Torch generator for deterministic generation.", - ) - - @classmethod - def sigmas(cls) -> "InputParam": - return cls( - name="sigmas", type_hint=List[float], default=None, description="Custom sigmas for the denoising process." - ) - - @classmethod - def strength(cls, default: float = 0.9) -> "InputParam": - return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") - - # images - @classmethod - def image(cls) -> "InputParam": - return cls( - name="image", - type_hint=PIL.Image.Image, - required=True, - description="Input image for img2img, editing, or conditioning.", - ) - - @classmethod - def mask_image(cls) -> "InputParam": - return cls( - name="mask_image", type_hint=PIL.Image.Image, required=True, description="Mask image for inpainting." - ) - - @classmethod - def control_image(cls) -> "InputParam": - return cls( - name="control_image", - type_hint=PIL.Image.Image, - required=True, - description="Control image for ControlNet conditioning.", - ) - - @classmethod - def padding_mask_crop(cls) -> "InputParam": - return cls( - name="padding_mask_crop", - type_hint=int, - default=None, - description="Padding for mask cropping in inpainting.", - ) - - @classmethod - def latents(cls) -> "InputParam": - return cls( - name="latents", - type_hint=torch.Tensor, - default=None, - description="Pre-generated noisy latents for image generation.", - ) - - @classmethod - def timesteps(cls) -> "InputParam": - return cls( - name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process." - ) - - @classmethod - def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", description="Output format: 'pil', 'np', 'pt''.") - - @classmethod - def attention_kwargs(cls) -> "InputParam": - return cls( - name="attention_kwargs", - type_hint=Dict[str, Any], - default=None, - description="Additional kwargs for attention processors.", - ) - - @classmethod - def denoiser_input_fields(cls) -> "InputParam": - return cls( - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", - ) - - # ControlNet - @classmethod - def control_guidance_start(cls, default: float = 0.0) -> "InputParam": - return cls( - name="control_guidance_start", - type_hint=float, - default=default, - description="When to start applying ControlNet.", - ) - - @classmethod - def control_guidance_end(cls, default: float = 1.0) -> "InputParam": - return cls( - name="control_guidance_end", - type_hint=float, - default=default, - description="When to stop applying ControlNet.", - ) - - @classmethod - def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": - return cls( - name="controlnet_conditioning_scale", - type_hint=float, - default=default, - description="Scale for ControlNet conditioning.", - ) + def template(cls, name: str, **overrides) -> "InputParam": + """Get template for name if exists, otherwise return basic InputParam with just the name.""" + if name in INPUT_PARAM_TEMPLATES: + kwargs = {"name": name, **INPUT_PARAM_TEMPLATES[name]} + # Override with user-provided values + for key, value in overrides.items(): + kwargs[key] = value + return cls(**kwargs) + return cls(name=name, **overrides) @dataclass class OutputParam: """Specification for an output parameter.""" - name: str + name: str = None type_hint: Any = None description: str = "" kwargs_type: str = None # YiYi notes: remove this feature (maybe) @@ -538,23 +497,15 @@ def __repr__(self): ) @classmethod - def template(cls, name: str) -> Optional["OutputParam"]: - """Get template for name if exists, otherwise None.""" - if hasattr(cls, name) and callable(getattr(cls, name)): - return getattr(cls, name)() - return None - - # ====================================================== - # OutputParam templates - # ====================================================== - - @classmethod - def images(cls) -> "OutputParam": - return cls(name="images", type_hint=List[PIL.Image.Image], description="Generated images.") - - @classmethod - def latents(cls) -> "OutputParam": - return cls(name="latents", type_hint=torch.Tensor, description="Denoised latents.") + def template(cls, name: str, **overrides) -> "OutputParam": + """Get template for name if exists, otherwise return basic OutputParam with just the name.""" + if name in OUTPUT_PARAM_TEMPLATES: + kwargs = {"name": name, **OUTPUT_PARAM_TEMPLATES[name]} + # Override with user-provided values + for key, value in overrides.items(): + kwargs[key] = value + return cls(**kwargs) + return cls(name=name, **overrides) def format_inputs_short(inputs): @@ -890,4 +841,4 @@ def make_doc_string( output += "\n\n" output += format_output_params(outputs, indent_level=2) - return output + return output \ No newline at end of file From 002c3e8239b267e17b3849d1e53fde78890f0ad1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 03:24:34 +0100 Subject: [PATCH 14/23] add template method --- .../modular_pipeline_utils.py | 163 ++++++++++++------ 1 file changed, 112 insertions(+), 51 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index f8dde1fbd096..a65aa43b2a3b 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -336,7 +336,6 @@ class ConfigSpec: }, "negative_prompt": { "type_hint": str, - "default": None, "description": "The prompt or prompts not to guide the image generation.", }, "max_sequence_length": { @@ -364,12 +363,10 @@ class ConfigSpec: }, "generator": 
{ "type_hint": torch.Generator, - "default": None, "description": "Torch generator for deterministic generation.", }, "sigmas": { "type_hint": List[float], - "default": None, "description": "Custom sigmas for the denoising process.", }, "strength": { @@ -378,33 +375,16 @@ class ConfigSpec: "description": "Strength for img2img/inpainting.", }, "image": { - "type_hint": PIL.Image.Image, + "type_hint": Union[PIL.Image.Image, List[PIL.Image.Image]], "required": True, - "description": "Input image for img2img, editing, or conditioning.", - }, - "mask_image": { - "type_hint": PIL.Image.Image, - "required": True, - "description": "Mask image for inpainting.", - }, - "control_image": { - "type_hint": PIL.Image.Image, - "required": True, - "description": "Control image for ControlNet conditioning.", - }, - "padding_mask_crop": { - "type_hint": int, - "default": None, - "description": "Padding for mask cropping in inpainting.", + "description": "Reference image(s) for denoising. Can be a single image or list of images.", }, "latents": { "type_hint": torch.Tensor, - "default": None, "description": "Pre-generated noisy latents for image generation.", }, "timesteps": { "type_hint": torch.Tensor, - "default": None, "description": "Timesteps for the denoising process.", }, "output_type": { @@ -414,14 +394,28 @@ class ConfigSpec: }, "attention_kwargs": { "type_hint": Dict[str, Any], - "default": None, "description": "Additional kwargs for attention processors.", }, "denoiser_input_fields": { "kwargs_type": "denoiser_input_fields", - "type_hint": torch.Tensor, "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", }, + # inpainting + "mask_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Mask image for inpainting.", + }, + "padding_mask_crop": { + "type_hint": int, + "description": "Padding for mask cropping in inpainting.", + }, + # controlnet + "control_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Control image for ControlNet conditioning.", + }, "control_guidance_start": { "type_hint": float, "default": 0.0, @@ -437,6 +431,45 @@ class ConfigSpec: "default": 1.0, "description": "Scale for ControlNet conditioning.", }, + "layers": { + "type_hint": int, + "default": 4, + "description": "Number of layers to extract from the image", + }, + # common intermediate inputs + "prompt_embeds":{ + "type_hint": torch.Tensor, + "required": True, + "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.", + }, + "prompt_embeds_mask": { + "type_hint": torch.Tensor, + "required": True, + "description": "mask for the text embeddings. Can be generated from text_encoder step.", + }, + "negative_prompt_embeds": { + "type_hint": torch.Tensor, + "description": "negative text embeddings used to guide the image generation. Can be generated from text_encoder step.", + }, + "negative_prompt_embeds_mask": { + "type_hint": torch.Tensor, + "description": "mask for the negative text embeddings. Can be generated from text_encoder step.", + }, + "image_latents": { + "type_hint": torch.Tensor, + "required": True, + "description": "image latents used to guide the image generation. Can be generated from vae_encoder step.", + }, + "batch_size": { + "type_hint": int, + "default": 1, + "description": "Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", + }, + "dtype": { + "type_hint": torch.dtype, + "default": torch.float32, + "description": "The dtype of the model inputs, can be generated in input step.", + }, } OUTPUT_PARAM_TEMPLATES = { @@ -448,15 +481,34 @@ class ConfigSpec: "type_hint": torch.Tensor, "description": "Denoised latents.", }, + # intermediate outputs + "prompt_embeds": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The prompt embeddings.", + }, + "prompt_embeds_mask": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The encoder attention mask.", + }, + "negative_prompt_embeds": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The negative prompt embeddings.", + }, + "negative_prompt_embeds_mask": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The negative prompt embeddings mask.", + }, + "image_latents": { + "type_hint": torch.Tensor, + "description": "The latent representation of the input image.", + }, } -# YiYi Notes: both inputs and intermediate_inputs are InputParam objects -# however some fields are not relevant for intermediate_inputs -# e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed -# default is not used for intermediate_inputs, we only use default from inputs, so it is ignored if it is set for intermediate_inputs -# -> should we use different class for inputs and intermediate_inputs? -@dataclass class InputParam: """Specification for an input parameter.""" @@ -465,31 +517,37 @@ class InputParam: default: Any = None required: bool = False description: str = "" - kwargs_type: str = None # YiYi Notes: remove this feature (maybe) + kwargs_type: str = None + + def __post_init__(self): + if self.required and self.default is not None: + raise ValueError(f"InputParam '{self.name}' cannot be both required and have a default value") def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @classmethod - def template(cls, name: str, **overrides) -> "InputParam": - """Get template for name if exists, otherwise return basic InputParam with just the name.""" - if name in INPUT_PARAM_TEMPLATES: - kwargs = {"name": name, **INPUT_PARAM_TEMPLATES[name]} - # Override with user-provided values - for key, value in overrides.items(): - kwargs[key] = value - return cls(**kwargs) - return cls(name=name, **overrides) + def template(cls, name: str, note: str = None, **overrides) -> "InputParam": + """Get template for name if exists, otherwise raise ValueError.""" + if name not in INPUT_PARAM_TEMPLATES: + raise ValueError(f"InputParam template for {name} not found") + + template_kwargs = INPUT_PARAM_TEMPLATES[name].copy() + + if note and "description" in template_kwargs: + template_kwargs["description"] = f"{template_kwargs['description']} ({note})" + + template_kwargs.update(overrides) + return cls(name=name, **template_kwargs) -@dataclass class OutputParam: """Specification for an output parameter.""" name: str = None type_hint: Any = None description: str = "" - kwargs_type: str = None # YiYi notes: remove this feature (maybe) + kwargs_type: str = None def __repr__(self): return ( @@ -497,15 +555,18 @@ def __repr__(self): ) @classmethod - def template(cls, name: str, **overrides) -> "OutputParam": - """Get template for name if exists, otherwise return basic OutputParam with just the 
name.""" - if name in OUTPUT_PARAM_TEMPLATES: - kwargs = {"name": name, **OUTPUT_PARAM_TEMPLATES[name]} - # Override with user-provided values - for key, value in overrides.items(): - kwargs[key] = value - return cls(**kwargs) - return cls(name=name, **overrides) + def template(cls, name: str, note: str = None, **overrides) -> "OutputParam": + """Get template for name if exists, otherwise raise ValueError.""" + if name not in OUTPUT_PARAM_TEMPLATES: + raise ValueError(f"OutputParam template for {name} not found") + + template_kwargs = OUTPUT_PARAM_TEMPLATES[name].copy() + + if note and "description" in template_kwargs: + template_kwargs["description"] = f"{template_kwargs['description']} ({note})" + + template_kwargs.update(overrides) + return cls(name=name, **template_kwargs) def format_inputs_short(inputs): From 1f2dbc9dd2bf4d256039120f6d6ccaf49f1c09c7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 04:10:17 +0100 Subject: [PATCH 15/23] up --- .../qwenimage/before_denoise.py | 187 +++---- .../modular_pipelines/qwenimage/decoders.py | 71 +-- .../modular_pipelines/qwenimage/denoise.py | 125 +---- .../modular_pipelines/qwenimage/encoders.py | 509 ++++++++---------- .../modular_pipelines/qwenimage/inputs.py | 282 +++++++--- .../qwenimage/modular_blocks_qwenimage.py | 61 ++- .../modular_blocks_qwenimage_edit.py | 39 +- .../modular_blocks_qwenimage_edit_plus.py | 30 +- .../modular_blocks_qwenimage_layered.py | 73 ++- 9 files changed, 677 insertions(+), 700 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index cb808b1d3807..b87c3555aad3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -134,28 +134,20 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.latents(), - InputParam.height(), - InputParam.width(), - InputParam.num_images_per_prompt(), - InputParam.generator(), - InputParam( - name="batch_size", - required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", - ), - InputParam( - name="dtype", - required=True, - type_hint=torch.dtype, - description="The dtype of the model inputs, can be generated in input step.", - ), + InputParam.template("latents"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("num_images_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ + OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -225,31 +217,21 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.latents(), - InputParam.height(), - InputParam.width(), - InputParam( - name="layers", type_hint=int, default=4, description="Number of layers to extract from the image" - ), - InputParam.num_images_per_prompt(), - InputParam.generator(), - InputParam( - name="batch_size", - required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.", - ), - InputParam( - name="dtype", - required=True, - type_hint=torch.dtype, - description="The dtype of the model inputs, can be generated in input step.", - ), + InputParam.template("latents"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("layers"), + InputParam.template("num_images_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ + OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -325,18 +307,8 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The initial random noised, can be generated in prepare latent step.", ), - InputParam( - name="image_latents", - required=True, - type_hint=torch.Tensor, - description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.", - ), - InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. 
Can be generated in set_timesteps step.", - ), + InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), + InputParam.template("timesteps", required=True, note="can be generated in set_timesteps step."), ] @property @@ -347,6 +319,11 @@ def intermediate_outputs(self) -> List[OutputParam]: type_hint=torch.Tensor, description="The initial random noised used for inpainting denoising.", ), + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The scalednoisy latents to use for inpainting/image-to-image denoising.", + ), ] @staticmethod @@ -406,9 +383,9 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The processed mask to use for the inpainting process.", ), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="dtype", required=True), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("dtype"), ] @property @@ -468,14 +445,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_inference_steps(), - InputParam.sigmas(), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process, used to calculate the image sequence length.", - ), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + InputParam.template("latents", required=True, description="The initial random noised latents for the denoising process, used to calculate the image sequence length. Can be generated in prepare latents step."), ] @property @@ -484,6 +456,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process" ), + OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -534,15 +507,16 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_inference_steps(), - InputParam.sigmas(), - InputParam("image_latents", required=True, type_hint=torch.Tensor), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="timesteps", type_hint=torch.Tensor), + OutputParam(name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"), + OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), ] @torch.no_grad() @@ -592,15 +566,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_inference_steps(), - InputParam.sigmas(), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process, used to calculate the image sequence length.", - ), - InputParam.strength(0.9), + InputParam.template("num_inference_steps"), + 
InputParam.template("sigmas"), + InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare latents step."), + InputParam.template("strength", default=0.9), ] @property @@ -609,7 +578,12 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="timesteps", type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", + description="The timesteps to use for the denoising process.", + ), + OutputParam( + name="num_inference_steps", + type_hint=int, + description="The number of denoising steps to perform at inference time", ), ] @@ -668,11 +642,11 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam.template("height", note="should be updated in prepare latents step."), + InputParam.template("width", note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -734,13 +708,13 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="image_height", required=True), - InputParam(name="image_width", required=True), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), + InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. Can be generated in input step."), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -813,13 +787,13 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="image_height", required=True, type_hint=List[int]), - InputParam(name="image_width", required=True, type_hint=List[int]), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam(name="image_height", required=True, type_hint=List[int], descrption="The heights of the reference images. Can be generated in input step."), + InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. 
Can be generated in input step."), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -887,12 +861,12 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="layers", default=4, description="Number of layers to extract from the image"), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam.template("layers"), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -973,16 +947,11 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.control_guidance_start(), - InputParam.control_guidance_end(), - InputParam.controlnet_conditioning_scale(), - InputParam("control_image_latents", required=True), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), + InputParam.template("control_guidance_start"), + InputParam.template("control_guidance_end"), + InputParam.template("controlnet_conditioning_scale"), + InputParam("control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step."), + InputParam.template("timesteps", required=True, note="Can be generated in set_timesteps step."), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 8207e99b69ae..499f0172888b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -47,14 +47,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", - ), + InputParam.template("height", required=True, note="should be updated in input and prepare latents step."), + InputParam.template("width", required=True, note="should be updated in input and prepare latents step."), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents", note="unpacked to B, C, 1, H, W"), ] @torch.no_grad() @@ -86,10 +87,16 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("height", required=True, type_hint=int), - InputParam("width", required=True, type_hint=int), - InputParam("layers", default=4, description="Number of layers to extract from the image"), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("layers"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W"), ] @torch.no_grad() @@ -128,17 +135,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", - ), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), ] @property - def intermediate_outputs(self) -> List[str]: - return [OutputParam.images()] + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images", note="tensor output of the vae decoder.")] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -190,19 +192,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", - ), - InputParam.output_type(), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), + 
InputParam.template("output_type"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.images(), + OutputParam.template("images"), ] @torch.no_grad() @@ -269,10 +266,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image from decoders step"), - InputParam.output_type(), + InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam.template("output_type"), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images")] + @staticmethod def check_inputs(output_type): if output_type not in ["pil", "np", "pt"]: @@ -314,11 +315,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image from decoders step"), - InputParam.output_type(), - InputParam("mask_overlay_kwargs"), + InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam.template("output_type"), + InputParam("mask_overlay_kwargs", description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images")] + @staticmethod def check_inputs(output_type, mask_overlay_kwargs): if output_type not in ["pil", "np", "pt"]: diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 472945b2269a..49fde3fd6ac3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -49,12 +49,7 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", - ), + InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), ] @torch.no_grad() @@ -79,18 +74,8 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", - ), - InputParam( - "image_latents", - required=True, - type_hint=torch.Tensor, - description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.", - ), + InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), + InputParam.template("image_latents", note="Can be encoded in vae_encoder step and packed in prepare_image_latents step."), ] @torch.no_grad() @@ -134,30 +119,10 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", ), - InputParam( - "controlnet_conditioning_scale", - type_hint=float, - description="The controlnet conditioning scale value to use for the denoising process. 
Can be generated in prepare_controlnet_inputs step.", - ), - InputParam( - "controlnet_keep", - required=True, - type_hint=List[float], - description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam( - kwargs_type="denoiser_input_fields", - description=( - "All conditional model inputs for the denoiser. " - "It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens." - ), - ), + InputParam.template("controlnet_conditioning_scale", note="Can be generated in prepare_controlnet_inputs step."), + InputParam.template("controlnet_keep", note="Can be generated in prepare_controlnet_inputs step."), + InputParam.template("num_inference_steps", required=True, note="Can be updated in set_timesteps step."), + InputParam.template("denoiser_input_fields") ] @torch.no_grad() @@ -218,25 +183,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.attention_kwargs(), - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam.denoiser_input_fields(), + InputParam.template("attention_kwargs"), + InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare_latents step."), + InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", required=True, type_hint=List[Tuple[int, int]], - description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.", + description="The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.", ), ] @@ -319,20 +274,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.attention_kwargs(), - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam.denoiser_input_fields(), + InputParam.template("attention_kwargs"), + InputParam.template("latents", required=True, description="The latents to use for the denoising process. 
Can be generated in prepare_latents step."), + InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", required=True, @@ -418,7 +363,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @torch.no_grad() @@ -459,24 +404,14 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "image_latents", - required=True, - type_hint=torch.Tensor, - description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.", - ), + InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), + InputParam.template("timesteps", required=True, note="should be updated in set_timesteps step."), ] @torch.no_grad() @@ -517,18 +452,8 @@ def loop_expected_components(self) -> List[ComponentSpec]: @property def loop_inputs(self) -> List[InputParam]: return [ - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. 
Can be generated in set_timesteps step.", - ), + InputParam.template("timesteps", required=True, note="should be generated in set_timesteps step."), + InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), ] @torch.no_grad() @@ -560,6 +485,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # Qwen Image (text2image, image2image) + +# auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" @@ -584,6 +511,7 @@ def description(self) -> str: # Qwen Image (inpainting) +# auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" block_classes = [ @@ -609,6 +537,7 @@ def description(self) -> str: # Qwen Image (text2image, image2image) with controlnet +# auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" block_classes = [ @@ -634,6 +563,7 @@ def description(self) -> str: # Qwen Image (inpainting) with controlnet +# auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" block_classes = [ @@ -667,6 +597,7 @@ def description(self) -> str: # Qwen Image Edit (image2image) +# auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage-edit" block_classes = [ @@ -690,6 +621,7 @@ def description(self) -> str: # Qwen Image Edit (inpainting) +# auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage-edit" block_classes = [ @@ -715,6 +647,7 @@ def description(self) -> str: # Qwen Image Layered (image2image) +# auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage-layered" block_classes = [ diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 8d7b1905423d..82a3b6811959 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -259,33 +259,30 @@ def encode_vae_image( # ==================== # 1. RESIZE # ==================== +# In QwenImage pipelines, resize is a separate step because the resized image is used in VL encoding and vae encoder blocks: +# +# image (PIL.Image.Image) +# │ +# ▼ +# resized_image ([PIL.Image.Image]) +# │ +# ├──► text_encoder ──► prompt_embeds, prompt_embeds_mask +# │ (VL encoding needs the resized image for vision-language fusion) +# │ +# └──► image_processor ──► processed_image (torch.Tensor, pixel space) +# │ +# ▼ +# vae_encoder ──► image_latents (torch.Tensor, latent space) +# +# In most of our other pipelines, resizing is done as part of the image preprocessing step. +# ==================== class QwenImageEditResizeStep(ModularPipelineBlocks): model_name = "qwenimage-edit" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - ): - """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. - - Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". 
- """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - super().__init__() @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." + return "Image Resize step that resize the image to target area while maintaining the aspect ratio." @property def expected_components(self) -> List[ComponentSpec]: @@ -300,21 +297,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [ - InputParam.template(self._image_input_name) - or InputParam( - name=self._image_input_name, - required=True, - type_hint=torch.Tensor, - description="Input image for conditioning", - ), - ] + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", ), ] @@ -322,7 +313,7 @@ def intermediate_outputs(self) -> List[OutputParam]: def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -338,7 +329,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): for image in images ] - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images self.set_block_state(state, block_state) return components, state @@ -346,30 +337,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageLayeredResizeStep(ModularPipelineBlocks): model_name = "qwenimage-layered" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - ): - """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. - - Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". - """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - super().__init__() - @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." + return f"Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." 
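# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): rough shape of a "resize to a target
# area while maintaining the aspect ratio" computation. The real steps delegate to
# `calculate_dimensions(...)` and `components.image_resize_processor`; rounding to
# a multiple of 32 below is an assumption made only for this illustration.
import math
import PIL.Image

def resize_to_area(image: PIL.Image.Image, target_area: int, multiple: int = 32) -> PIL.Image.Image:
    aspect_ratio = image.width / image.height
    width = math.sqrt(target_area * aspect_ratio)   # width * height == target_area
    height = width / aspect_ratio                   # width / height == aspect_ratio
    width = max(multiple, round(width / multiple) * multiple)
    height = max(multiple, round(height / multiple) * multiple)
    return image.resize((int(width), int(height)))

# e.g. the Edit Plus step further below uses a ~1024*1024 target area for the VAE
# branch and a ~384*384 target area for the VL text-encoder branch.
# --------------------------------------------------------------------------------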
@property def expected_components(self) -> List[ComponentSpec]: @@ -385,10 +355,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) - or InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" - ), + InputParam.template("image"), InputParam( name="resolution", default=640, @@ -399,11 +366,11 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [ - OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" - ), - ] + return [OutputParam( + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", + )] @staticmethod def check_inputs(resolution: int): @@ -416,7 +383,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.check_inputs(resolution=block_state.resolution) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -433,45 +400,21 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): for image in images ] - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images self.set_block_state(state, block_state) return components, state class QwenImageEditPlusResizeStep(ModularPipelineBlocks): - """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus.""" model_name = "qwenimage-edit-plus" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - target_area: int = 1024 * 1024, - ): - """Create a step for resizing images to a target area. - - Each image is resized independently based on its own aspect ratio. This is suitable for Edit Plus where - multiple reference images can have different dimensions. - - Args: - input_name (str, optional): Name of the image field to read. Defaults to "image". - output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image". - target_area (int, optional): Target area in pixels. Defaults to 1024*1024. - """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - self._target_area = target_area - super().__init__() - @property def description(self) -> str: return ( - f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n" + "Resize images for QwenImage Edit Plus pipeline.\n" + "Produces two outputs: resized_image (1024x1024) for VAE encoding, " + "resized_cond_image (384x384) for VL text encoding.\n" "Each image is resized independently based on its own aspect ratio." 
) @@ -488,21 +431,21 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [ - InputParam.template(self._image_input_name) - or InputParam( - name=self._image_input_name, - required=True, - type_hint=torch.Tensor, - description="The image(s) to resize", - ), - ] + # image + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + name="resized_image", + type_hint=List[PIL.Image.Image], + description="Images resized to 1024x1024 target area for VAE encoding", + ), + OutputParam( + name="resized_cond_image", + type_hint=List[PIL.Image.Image], + description="Images resized to 384x384 target area for VL text encoding", ), ] @@ -510,7 +453,7 @@ def intermediate_outputs(self) -> List[OutputParam]: def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -520,16 +463,24 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # Resize each image independently based on its own aspect ratio resized_images = [] + resized_cond_images = [] for image in images: image_width, image_height = image.size - calculated_width, calculated_height, _ = calculate_dimensions( - self._target_area, image_width / image_height - ) + + # For VAE encoder (1024x1024 target area) + vae_width, vae_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) resized_images.append( - components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width) + components.image_resize_processor.resize(image, height=vae_height, width=vae_width) + ) + + # For VL text encoder (384x384 target area) + vl_width, vl_height, _ = calculate_dimensions(384 * 384, image_width / image_height) + resized_cond_images.append( + components.image_resize_processor.resize(image, height=vl_height, width=vl_width) ) - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images + block_state.resized_cond_image = resized_cond_images self.set_block_state(state, block_state) return components, state @@ -538,13 +489,14 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 2. GET IMAGE PROMPT # ==================== class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): - """ - Auto-caption step that generates a text prompt from the input image if none is provided. Uses the VL model to - generate a description of the image. 
- """ model_name = "qwenimage-layered" + def __init__(self): + self.image_caption_prompt_en = QWENIMAGE_LAYERED_CAPTION_PROMPT_EN + self.image_caption_prompt_cn = QWENIMAGE_LAYERED_CAPTION_PROMPT_CN + super().__init__() + @property def description(self) -> str: return ( @@ -560,19 +512,10 @@ def expected_components(self) -> List[ComponentSpec]: ComponentSpec("processor", Qwen2VLProcessor), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="image_caption_prompt_en", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_EN), - ConfigSpec(name="image_caption_prompt_cn", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_CN), - ] - @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name="prompt", type_hint=str, description="The prompt to encode" - ), # it is not required for qwenimage-layered, unlike other pipelines + InputParam.template("prompt", required=False), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -596,9 +539,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # If prompt is empty or None, generate caption from image if block_state.prompt is None or block_state.prompt == "" or block_state.prompt == " ": if block_state.use_en_prompt: - caption_prompt = components.config.image_caption_prompt_en + caption_prompt = self.image_caption_prompt_en else: - caption_prompt = components.config.image_caption_prompt_cn + caption_prompt = self.image_caption_prompt_cn model_inputs = components.processor( text=caption_prompt, @@ -627,6 +570,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_PROMPT_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_PROMPT_TEMPLATE_START_IDX + self.tokenizer_max_length = 1024 + super().__init__() + @property def description(self) -> str: return "Text Encoder step that generates text embeddings to guide the image generation." 
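# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): with the caption/prompt templates moved
# from `expected_configs` onto the block instance in `__init__`, a customized
# encoder is built by overriding the attributes rather than passing ConfigSpec
# values. The template string and start index below are hypothetical, shown only
# to illustrate the pattern; the import path assumes this patched source tree.
from diffusers.modular_pipelines.qwenimage.encoders import QwenImageTextEncoderStep

text_encoder_step = QwenImageTextEncoderStep()
text_encoder_step.prompt_template_encode = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"  # hypothetical template
text_encoder_step.prompt_template_encode_start_idx = 10  # hypothetical drop index
text_encoder_step.tokenizer_max_length = 1024
# --------------------------------------------------------------------------------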
@@ -644,49 +593,22 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_PROMPT_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_PROMPT_TEMPLATE_START_IDX), - ConfigSpec(name="tokenizer_max_length", default=1024), - ] @property def inputs(self) -> List[InputParam]: return [ - InputParam.prompt(), - InputParam.negative_prompt(), - InputParam.max_sequence_length(1024), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), + InputParam.template("max_sequence_length", default=1024), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -715,9 +637,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.text_encoder, components.tokenizer, prompt=block_state.prompt, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - tokenizer_max_length=components.config.tokenizer_max_length, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, + tokenizer_max_length=self.tokenizer_max_length, device=device, ) @@ -732,9 +654,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.text_encoder, components.tokenizer, prompt=negative_prompt, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - tokenizer_max_length=components.config.tokenizer_max_length, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, + tokenizer_max_length=self.tokenizer_max_length, device=device, ) block_state.negative_prompt_embeds = block_state.negative_prompt_embeds[ @@ -751,6 +673,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_EDIT_PROMPT_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX + super().__init__() + @property def description(self) -> str: return "Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation." 
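# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): `InputParam.template(...)` and
# `OutputParam.template(...)` are called throughout this diff but their definition
# is not included in it. One plausible backing is a registry of named parameter
# specs that per-step overrides (`required=`, `default=`, `note=`) are merged into;
# everything below is an assumption for illustration, not the library's API.
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

_INPUT_TEMPLATES = {  # hypothetical registry
    "prompt": dict(type_hint=str, required=True, description="The prompt or prompts to guide image generation."),
    "height": dict(type_hint=int, default=None, description="The height in pixels of the generated image."),
}

def input_param_from_template(name, note=None, **overrides):
    spec = {**_INPUT_TEMPLATES[name], **overrides}
    if note:  # step-specific note appended to the shared description
        spec["description"] = (spec.get("description", "") + " " + note).strip()
    return InputParam(name=name, **spec)

# e.g. input_param_from_template("height", required=True, note="should be updated in prepare latents step.")
# --------------------------------------------------------------------------------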
@@ -768,18 +695,12 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX), - ] @property def inputs(self) -> List[InputParam]: return [ - InputParam.prompt(), - InputParam.negative_prompt(), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), InputParam( name="resized_image", required=True, @@ -791,30 +712,10 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -842,8 +743,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=block_state.prompt, image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -856,8 +757,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=negative_prompt, image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -866,10 +767,15 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): - """Text encoder for QwenImage Edit Plus (VL encoding with multiple images).""" model_name = "qwenimage-edit-plus" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE + self.img_template_encode = QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX + super().__init__() + @property def description(self) -> str: return ( @@ -890,19 +796,12 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE), - ConfigSpec(name="img_template_encode", default=QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", 
default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX), - ] @property def inputs(self) -> List[InputParam]: return [ - InputParam.prompt(), - InputParam.negative_prompt(), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), InputParam( name="resized_cond_image", required=True, @@ -914,30 +813,10 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -965,9 +844,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=block_state.prompt, image=block_state.resized_cond_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + img_template_encode=self.img_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -981,9 +860,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=negative_prompt, image=block_state.resized_cond_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + img_template_encode=self.img_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) ) @@ -1016,18 +895,26 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.mask_image(), - InputParam.image(), - InputParam.height(), - InputParam.width(), - InputParam.padding_mask_crop(), + InputParam.template("mask_image"), + InputParam.template("image"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("padding_mask_crop"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="processed_image"), - OutputParam(name="processed_mask_image"), + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ), + OutputParam( + name="processed_mask_image", + type_hint=torch.Tensor, + description="The processed mask image", + ), OutputParam( name="mask_overlay_kwargs", type_hint=Dict, @@ -1088,21 +975,29 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.mask_image(), + 
InputParam.template("mask_image"), InputParam( - "resized_image", + name="resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step", ), - InputParam.padding_mask_crop(), + InputParam.template("padding_mask_crop"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="processed_image"), - OutputParam(name="processed_mask_image"), + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image" + ), + OutputParam( + name="processed_mask_image", + type_hint=torch.Tensor, + description="The processed mask image", + ), OutputParam( name="mask_overlay_kwargs", type_hint=Dict, @@ -1151,14 +1046,18 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.image(), - InputParam.height(), - InputParam.width(), + InputParam.template("image"), + InputParam.template("height"), + InputParam.template("width"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + )] @staticmethod def check_inputs(height, width, vae_scale_factor): @@ -1209,12 +1108,21 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("resized_image", required=True), + InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + ), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + )] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1252,11 +1160,20 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam("resized_image")] + return [InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + )] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + )] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1274,7 +1191,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): processed_images.append( components.image_processor.preprocess(image=img, height=img_height, width=img_width) ) - block_state.processed_image = processed_images + if is_image_list: block_state.processed_image = processed_images else: @@ -1294,8 +1211,8 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): def __init__( self, - input_name: str = "processed_image", - output_name: str = "image_latents", + input: Optional[InputParam] = None, + output: Optional[OutputParam] = None, ): """Initialize a VAE encoder step for converting images to latent representations. @@ -1303,11 +1220,24 @@ def __init__( a single tensor, outputs a single latent tensor. Args: - input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image". 
- output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents". + input (InputParam, optional): Input parameter for the processed image. Defaults to "processed_image". + output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents". """ - self._image_input_name = input_name - self._image_latents_output_name = output_name + if input is None: + input = InputParam(name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode") + + if output is None: + output = OutputParam.template("image_latents") + + if not isinstance(input, InputParam): + raise ValueError(f"input must be InputParam but is {type(input)}") + if not isinstance(output, OutputParam): + raise ValueError(f"output must be OutputParam but is {type(output)}") + + self._input = input + self._output = output + self._image_input_name = input.name + self._image_latents_output_name = output.name super().__init__() @property @@ -1324,20 +1254,13 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) - or InputParam(name=self._image_input_name, required=True, description="The image tensor to encode"), - InputParam.generator(), + self._input, # default is "processed_image" + InputParam.template("generator"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [ - OutputParam( - self._image_latents_output_name, - type_hint=torch.Tensor, - description="The latents representing the reference image(s). Single tensor or list depending on input.", - ) - ] + return [self._output] # default is "image_latents" @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -1398,10 +1321,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.control_image(), - InputParam.height(), - InputParam.width(), - InputParam.generator(), + InputParam.template("control_image"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("generator"), ] return inputs @@ -1489,22 +1412,22 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 6. PERMUTE LATENTS # ==================== class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): - """Permute image latents from VAE format to Layered format.""" - model_name = "qwenimage-layered" - def __init__(self, input_name: str = "image_latents"): - self._input_name = input_name - super().__init__() - @property def description(self) -> str: - return f"Permute {self._input_name} from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." + return f"Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." 
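# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): the VAE encoder step is now configured
# with InputParam/OutputParam objects instead of bare field names, so the same
# block can be pointed at a different image stream. The "processed_control_image"
# name below is an example, not something introduced by this diff; import paths
# assume this patched source tree.
import torch

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam, OutputParam
from diffusers.modular_pipelines.qwenimage.encoders import QwenImageVaeEncoderStep

control_image_encoder = QwenImageVaeEncoderStep(
    input=InputParam(
        name="processed_control_image", required=True, type_hint=torch.Tensor,
        description="The preprocessed control image to encode",
    ),
    output=OutputParam(
        name="control_image_latents", type_hint=torch.Tensor,
        description="The control image latents",
    ),
)
# --------------------------------------------------------------------------------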
@property def inputs(self) -> List[InputParam]: return [ - InputParam(self._input_name, required=True), + InputParam.template("image_latents"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("image_latents", note="permuted from [B, C, 1, H, W] to [B, 1, C, H, W]"), ] @torch.no_grad() @@ -1512,8 +1435,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Permute: (B, C, 1, H, W) -> (B, 1, C, H, W) - latents = getattr(block_state, self._input_name) - setattr(block_state, self._input_name, latents.permute(0, 2, 1, 3, 4)) + latents = block_state.image_latents + block_state.image_latents = latents.permute(0, 2, 1, 3, 4) self.set_block_state(state, block_state) - return components, state + return components, state \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index e28493ecc369..bd2f79ae7c4c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple +from typing import List, Tuple, Optional import torch @@ -129,26 +129,22 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_images_per_prompt(), - InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"), - InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"), - InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"), - InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"), + InputParam.template("num_images_per_prompt"), + InputParam.template("prompt_embeds"), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask"), ] @property - def intermediate_outputs(self) -> List[str]: + def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - "batch_size", - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt", - ), - OutputParam( - "dtype", - type_hint=torch.dtype, - description="Data type of model tensor inputs (determined by `prompt_embeds`)", - ), + OutputParam.template("batch_size"), + OutputParam.template("dtype"), + OutputParam.template("prompt_embeds", note="batch-expanded"), + OutputParam.template("prompt_embeds_mask", note="batch-expanded"), + OutputParam.template("negative_prompt_embeds", note="batch-expanded"), + OutputParam.template("negative_prompt_embeds_mask", note="batch-expanded"), ] @staticmethod @@ -228,13 +224,28 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], - ): + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, + ): + # by default, process `image_latents` + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise 
ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -252,9 +263,9 @@ def description(self) -> str: if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." @@ -269,23 +280,19 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.num_images_per_prompt(), - InputParam(name="batch_size", required=True), - InputParam.height(), - InputParam.width(), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), + InputParam.template("height"), + InputParam.template("width"), ] - - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) + # default is `image_latents` + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=int, @@ -295,14 +302,42 @@ def intermediate_outputs(self) -> List[OutputParam]: name="image_width", type_hint=int, description="The image width calculated from the image latents dimension", - ), + ) ] + # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided + if len(self._image_latent_inputs) > 0: + outputs.append(OutputParam(name="height", type_hint=int, note="updated based on image size if not provided")) + outputs.append(OutputParam(name="width", type_hint=int, note="updated based on image size if not provided")) + + # image latent inputs are modified in place (patchified and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified and batch-expanded)", + ) + ) + + # additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " 
(batch-expanded)", + ) + ) + + return outputs + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -331,7 +366,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, image_latent_input_name, image_latent_tensor) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -356,13 +392,27 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, ): + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -381,9 +431,9 @@ def description(self) -> str: if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
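# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): the additional-inputs step is now
# parametrized with InputParam objects instead of plain strings, so type hints and
# descriptions travel with the configured names. The extra "control_image_latents"
# entry below is only an example of an additional batch input; import paths assume
# this patched source tree.
import torch

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam
from diffusers.modular_pipelines.qwenimage.inputs import QwenImageAdditionalInputsStep

step = QwenImageAdditionalInputsStep(
    image_latent_inputs=[InputParam.template("image_latents")],  # same as the default
    additional_batch_inputs=[
        InputParam(
            name="control_image_latents", type_hint=torch.Tensor,
            description="Control image latents to batch-expand",
        ),
    ],
)
# --------------------------------------------------------------------------------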
@@ -398,23 +448,20 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.num_images_per_prompt(), - InputParam(name="batch_size", required=True), - InputParam.height(), - InputParam.width(), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), + InputParam.template("height"), + InputParam.template("width"), ] - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) + # default is `image_latents` + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=List[int], @@ -426,12 +473,40 @@ def intermediate_outputs(self) -> List[OutputParam]: description="The image widths calculated from the image latents dimension", ), ] + + # `height`/`width` are updated if any image latent inputs are provided + if len(self._image_latent_inputs) > 0: + outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + + # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified, concatenated, and batch-expanded)", + ) + ) + + # additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (batch-expanded)", + ) + ) + + return outputs def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -476,7 +551,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, image_latent_input_name, packed_image_latent_tensors) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -494,8 +570,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -# YiYi TODO: support define config default component from the ModularPipeline level. -# it is same as QwenImageAdditionalInputsStep, but with layered pachifier. +# same as QwenImageAdditionalInputsStep, but with layered pachifier. 
class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier.""" @@ -503,13 +578,27 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, ): + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -527,9 +616,9 @@ def description(self) -> str: if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
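# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): what "batch-expanded" means in the
# intermediate output descriptions of these additional-inputs steps. A per-prompt
# tensor is repeated along dim 0 until its batch dimension equals
# batch_size * num_images_per_prompt. The exact expansion these blocks perform is
# not shown in this hunk; this is a minimal stand-in for reading the descriptions.
import torch

def expand_batch(tensor: torch.Tensor, batch_size: int, num_images_per_prompt: int) -> torch.Tensor:
    final_batch_size = batch_size * num_images_per_prompt
    repeats = final_batch_size // tensor.shape[0]
    return tensor.repeat_interleave(repeats, dim=0)

# e.g. image_latents of shape (1, seq_len, channels) with batch_size=2 and
# num_images_per_prompt=2 becomes shape (4, seq_len, channels).
# --------------------------------------------------------------------------------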
@@ -544,21 +633,18 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.num_images_per_prompt(), - InputParam(name="batch_size", required=True), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), ] + # default is `image_latents` - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=int, @@ -569,15 +655,40 @@ def intermediate_outputs(self) -> List[OutputParam]: type_hint=int, description="The image width calculated from the image latents dimension", ), - OutputParam(name="height", type_hint=int, description="The height of the image output"), - OutputParam(name="width", type_hint=int, description="The width of the image output"), ] + if len(self._image_latent_inputs) > 0: + outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + + # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified with layered pachifier and batch-expanded)", + ) + ) + + # Add outputs for additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (batch-expanded)", + ) + ) + + return outputs + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -608,7 +719,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, image_latent_input_name, image_latent_tensor) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -636,11 +748,19 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="control_image_latents", required=True), - InputParam(name="batch_size", required=True), - InputParam.num_images_per_prompt(), - InputParam.height(), - InputParam.width(), + InputParam(name="control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step."), + InputParam.template("batch_size"), + InputParam.template("num_images_per_prompt"), + InputParam.template("height"), + InputParam.template("width"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The control image latents (patchified and batch-expanded)."), + OutputParam(name="height", type_hint=int, description="updated based on control image size if not provided"), + OutputParam(name="width", type_hint=int, description="updated based on control image size if not provided"), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 645c01f66ee5..42593a93f98a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -75,8 +75,11 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -400,8 +403,7 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -440,8 +442,7 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs - (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -478,7 +479,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -519,8 +520,7 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -563,7 +563,7 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. 
attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -606,8 +606,7 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -648,7 +647,7 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -691,8 +690,7 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs - (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -742,6 +740,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. + denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: latents (`Tensor`): @@ -785,8 +785,7 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -842,6 +841,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. + denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: latents (`Tensor`): @@ -887,8 +888,7 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -942,6 +942,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. + denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
Outputs: latents (`Tensor`): @@ -1065,7 +1067,7 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): @@ -1085,8 +1087,7 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: @@ -1098,7 +1099,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. @@ -1182,8 +1183,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -1228,7 +1232,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. image_latents (`None`, *optional*): TODO: Add description. @@ -1244,8 +1248,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 0bfbb921c9c4..46e8881b9521 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -74,10 +74,11 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) @@ -376,8 +377,7 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -452,7 +452,7 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -536,7 +536,7 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -630,7 +630,7 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): @@ -650,8 +650,7 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: @@ -663,7 +662,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. @@ -722,8 +721,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide - `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: @@ -750,10 +748,11 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) @@ -790,10 +789,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 8dab6fbcf95d..1fb967bf1322 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -67,10 +67,11 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -99,7 +100,7 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"), + QwenImageEditPlusResizeStep(), QwenImageEditPlusTextEncoderStep(), ] block_names = ["resize", "encode"] @@ -145,7 +146,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"), + QwenImageEditPlusResizeStep(), QwenImageEditPlusProcessImagesInputStep(), QwenImageVaeEncoderStep(), ] @@ -268,7 +269,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -325,7 +326,7 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): @@ -386,10 +387,11 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -418,10 +420,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. 
Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 544b1abfc3ed..7d6c2ea0635a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,8 +53,7 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: @@ -71,23 +70,28 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -97,11 +101,16 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 
保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -300,7 +309,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -381,23 +390,28 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -407,11 +421,16 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 
保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -444,10 +463,10 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): From fb15752d5538c4e4ec95d8164630cbc374002405 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 08:10:31 +0100 Subject: [PATCH 16/23] up up up --- .../modular_pipeline_utils.py | 35 +++++--- .../qwenimage/before_denoise.py | 79 +++++++++++++------ .../modular_pipelines/qwenimage/decoders.py | 63 ++++++++++++--- .../modular_pipelines/qwenimage/denoise.py | 63 +++++++++++---- .../modular_pipelines/qwenimage/encoders.py | 2 +- .../modular_pipelines/qwenimage/inputs.py | 20 ++--- .../qwenimage/modular_blocks_qwenimage.py | 24 +++--- .../modular_blocks_qwenimage_edit.py | 20 ++--- .../modular_blocks_qwenimage_edit_plus.py | 10 +-- .../modular_blocks_qwenimage_layered.py | 8 +- 10 files changed, 216 insertions(+), 108 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index a65aa43b2a3b..5ef1b98f1ba3 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -397,6 +397,7 @@ class ConfigSpec: "description": "Additional kwargs for attention processors.", }, "denoiser_input_fields": { + "name": None, "kwargs_type": "denoiser_input_fields", "description": "conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", }, @@ -509,6 +510,7 @@ class ConfigSpec: } +@dataclass class InputParam: """Specification for an input parameter.""" @@ -519,20 +521,22 @@ class InputParam: description: str = "" kwargs_type: str = None - def __post_init__(self): - if self.required and self.default is not None: - raise ValueError(f"InputParam '{self.name}' cannot be both required and have a default value") - def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @classmethod - def template(cls, name: str, note: str = None, **overrides) -> "InputParam": + def template(cls, template_name: str, note: str = None, **overrides) -> "InputParam": """Get template for name if exists, otherwise raise ValueError.""" - if name not in INPUT_PARAM_TEMPLATES: - raise ValueError(f"InputParam template for {name} not found") + if template_name not in INPUT_PARAM_TEMPLATES: + raise ValueError(f"InputParam template for {template_name} not found") - template_kwargs = INPUT_PARAM_TEMPLATES[name].copy() + template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy() + + # Determine the actual param name: + # 1. From overrides if provided + # 2. From template if present + # 3. Fall back to template_name + name = overrides.pop("name", template_kwargs.pop("name", template_name)) if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" @@ -541,6 +545,7 @@ def template(cls, name: str, note: str = None, **overrides) -> "InputParam": return cls(name=name, **template_kwargs) +@dataclass class OutputParam: """Specification for an output parameter.""" @@ -555,12 +560,18 @@ def __repr__(self): ) @classmethod - def template(cls, name: str, note: str = None, **overrides) -> "OutputParam": + def template(cls, template_name: str, note: str = None, **overrides) -> "OutputParam": """Get template for name if exists, otherwise raise ValueError.""" - if name not in OUTPUT_PARAM_TEMPLATES: - raise ValueError(f"OutputParam template for {name} not found") + if template_name not in OUTPUT_PARAM_TEMPLATES: + raise ValueError(f"OutputParam template for {template_name} not found") + + template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy() - template_kwargs = OUTPUT_PARAM_TEMPLATES[name].copy() + # Determine the actual param name: + # 1. From overrides if provided + # 2. From template if present + # 3. 
Fall back to template_name + name = overrides.pop("name", template_kwargs.pop("name", template_name)) if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index b87c3555aad3..fc795b5f5a2f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -146,8 +146,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), - OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="height", type_hint=int, description="if not set, updated to default value"), + OutputParam(name="width", type_hint=int, description="if not set, updated to default value"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -230,8 +230,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), - OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="height", type_hint=int, description="if not set, updated to default value"), + OutputParam(name="width", type_hint=int, description="if not set, updated to default value"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -307,8 +307,13 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The initial random noised, can be generated in prepare latent step.", ), - InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), - InputParam.template("timesteps", required=True, note="can be generated in set_timesteps step."), + InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), ] @property @@ -322,7 +327,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="latents", type_hint=torch.Tensor, - description="The scalednoisy latents to use for inpainting/image-to-image denoising.", + description="The scaled noisy latents to use for inpainting/image-to-image denoising.", ), ] @@ -383,8 +388,8 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The processed mask to use for the inpainting process.", ), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("dtype"), ] @@ -447,7 +452,12 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("num_inference_steps"), InputParam.template("sigmas"), - InputParam.template("latents", required=True, description="The initial random noised latents for the denoising process, used to calculate the image sequence length. 
Can be generated in prepare latents step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial random noised latents for the denoising process. Can be generated in prepare latents step." + ), ] @property @@ -456,7 +466,6 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process" ), - OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -515,8 +524,11 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"), - OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), + OutputParam( + name="timesteps", + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process." + ), ] @torch.no_grad() @@ -568,7 +580,12 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("num_inference_steps"), InputParam.template("sigmas"), - InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare latents step."), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare latents step." + ), InputParam.template("strength", default=0.9), ] @@ -583,7 +600,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="num_inference_steps", type_hint=int, - description="The number of denoising steps to perform at inference time", + description="The number of denoising steps to perform at inference time. Updated based on strength.", ), ] @@ -643,8 +660,8 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam.template("height", note="should be updated in prepare latents step."), - InputParam.template("width", note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -711,8 +728,8 @@ def inputs(self) -> List[InputParam]: InputParam.template("batch_size"), InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. 
Can be generated in input step."), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -788,10 +805,10 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=List[int], descrption="The heights of the reference images. Can be generated in input step."), + InputParam(name="image_height", required=True, type_hint=List[int], description="The heights of the reference images. Can be generated in input step."), InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. Can be generated in input step."), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -863,8 +880,8 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), InputParam.template("layers"), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -950,8 +967,18 @@ def inputs(self) -> List[InputParam]: InputParam.template("control_guidance_start"), InputParam.template("control_guidance_end"), InputParam.template("controlnet_conditioning_scale"), - InputParam("control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."), - InputParam.template("timesteps", required=True, note="Can be generated in set_timesteps step."), + InputParam( + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step." + ), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 499f0172888b..4476e1db9bad 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
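Across these hunks the per-block InputParam definitions are replaced by template lookups with optional notes and overrides. The sketch below illustrates how InputParam.template() resolves fields, based on the classmethod shown earlier in this patch; the "height", "image_latents", and "num_inference_steps" template entries and their exact field values are assumptions.

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam  # assumed path

# Plain lookup: name, type_hint, default, and description come from the
# INPUT_PARAM_TEMPLATES entry keyed by "height"; required=True is an override.
height = InputParam.template("height", required=True)

# A note is appended to the template description as " (<note>)".
image_latents = InputParam.template(
    "image_latents",
    note="Can be generated from vae encoder and updated in input step.",
)

# Overrides take precedence over template values; an explicit name= override
# also wins over both the template's own "name" field and the template key.
steps = InputParam.template("num_inference_steps", default=28)

# Unknown keys fail fast:
# InputParam.template("not_a_template")  # ValueError: InputParam template for not_a_template not found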
-from typing import List +from typing import Any, Dict, List import torch @@ -47,15 +47,24 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("height", required=True, note="should be updated in input and prepare latents step."), - InputParam.template("width", required=True, note="should be updated in input and prepare latents step."), - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step." + ), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.template("latents", note="unpacked to B, C, 1, H, W"), + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The denoisedlatents unpacked to B, C, 1, H, W" + ), ] @torch.no_grad() @@ -87,9 +96,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step." + ), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("layers"), ] @@ -135,7 +149,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + ), ] @property @@ -192,7 +211,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." 
+ ), InputParam.template("output_type"), ] @@ -266,7 +290,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam( + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step" + ), InputParam.template("output_type"), ] @@ -315,9 +344,17 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam( + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step" + ), InputParam.template("output_type"), - InputParam("mask_overlay_kwargs", description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), + InputParam( + name="mask_overlay_kwargs", + type_hint=Dict[str, Any], + description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 49fde3fd6ac3..ad6a9677aca3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -49,7 +49,12 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + ), ] @torch.no_grad() @@ -74,8 +79,13 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), - InputParam.template("image_latents", note="Can be encoded in vae_encoder step and packed in prepare_image_latents step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + ), + InputParam.template("image_latents", note="generated in vae encoder step and updated in input step."), ] @torch.no_grad() @@ -119,10 +129,13 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", ), - InputParam.template("controlnet_conditioning_scale", note="Can be generated in prepare_controlnet_inputs step."), - InputParam.template("controlnet_keep", note="Can be generated in prepare_controlnet_inputs step."), - InputParam.template("num_inference_steps", required=True, note="Can be updated in set_timesteps step."), - InputParam.template("denoiser_input_fields") + InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."), + InputParam( + name="controlnet_keep", + required=True, + type_hint=List[float], + description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step." 
+ ), ] @torch.no_grad() @@ -184,8 +197,13 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare_latents step."), - InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare_latents step." + ), + InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -275,8 +293,13 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare_latents step."), - InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare_latents step." + ), + InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -404,14 +427,19 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), + InputParam.template("image_latents", note="Can be generated from vae encoder step and updated in input step."), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam.template("timesteps", required=True, note="should be updated in set_timesteps step."), + InputParam( + "timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), ] @torch.no_grad() @@ -452,8 +480,13 @@ def loop_expected_components(self) -> List[ComponentSpec]: @property def loop_inputs(self) -> List[InputParam]: return [ - InputParam.template("timesteps", required=True, note="should be generated in set_timesteps step."), - InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), + InputParam.template("num_inference_steps", required=True), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 82a3b6811959..9a83f0d7178a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -1145,7 +1145,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Image Preprocess step. Images can be resized first using QwenImageEditResizeStep." + return "Image Preprocess step. 
Images can be resized first. If a list of images is provided, will return a list of processed images." @property def expected_components(self) -> List[ComponentSpec]: diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index bd2f79ae7c4c..b237031b91d2 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -139,8 +139,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.template("batch_size"), - OutputParam.template("dtype"), + OutputParam(name="batch_size", type_hint=int, description="The batch size of the prompt embeddings"), + OutputParam(name="dtype", type_hint=torch.dtype, description="The data type of the prompt embeddings"), OutputParam.template("prompt_embeds", note="batch-expanded"), OutputParam.template("prompt_embeds_mask", note="batch-expanded"), OutputParam.template("negative_prompt_embeds", note="batch-expanded"), @@ -307,8 +307,8 @@ def intermediate_outputs(self) -> List[OutputParam]: # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, note="updated based on image size if not provided")) - outputs.append(OutputParam(name="width", type_hint=int, note="updated based on image size if not provided")) + outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) + outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) # image latent inputs are modified in place (patchified and batch-expanded) for input_param in self._image_latent_inputs: @@ -476,8 +476,8 @@ def intermediate_outputs(self) -> List[OutputParam]: # `height`/`width` are updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) - outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) + outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) for input_param in self._image_latent_inputs: @@ -658,8 +658,8 @@ def intermediate_outputs(self) -> List[OutputParam]: ] if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) - outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) + outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) for input_param in self._image_latent_inputs: @@ -759,8 +759,8 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The 
control image latents (patchified and batch-expanded)."), - OutputParam(name="height", type_hint=int, description="updated based on control image size if not provided"), - OutputParam(name="width", type_hint=int, description="updated based on control image size if not provided"), + OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"), + OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 42593a93f98a..46f0b6f6ff5a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam from .before_denoise import ( QwenImageControlNetBeforeDenoiserStep, QwenImageCreateMaskLatentsStep, @@ -319,7 +319,7 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ model_name = "qwenimage" - block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] + block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep()] block_names = ["text_inputs", "additional_inputs"] @property @@ -373,7 +373,7 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): block_classes = [ QwenImageTextInputsStep(), QwenImageAdditionalInputsStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -512,7 +512,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -598,7 +598,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -682,7 +682,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -777,7 +777,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -880,7 +880,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -981,7 +981,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -1042,7 +1042,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -1279,5 +1279,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 46e8881b9521..158763ce917a 100644 --- 
a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -13,10 +13,11 @@ # limitations under the License. from typing import Optional +import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam from .before_denoise import ( QwenImageCreateMaskLatentsStep, QwenImageEditRoPEInputsStep, @@ -206,7 +207,7 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): block_classes = [ QwenImageEditResizeStep(), QwenImageEditInpaintProcessImagesInputStep(), - QwenImageVaeEncoderStep(input_name="processed_image", output_name="image_latents"), + QwenImageVaeEncoderStep(), ] block_names = ["resize", "preprocess", "encode"] @@ -286,7 +287,7 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -344,8 +345,7 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + QwenImageAdditionalInputsStep(additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -485,7 +485,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -571,7 +571,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -605,7 +605,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -698,7 +698,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -816,5 +816,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 1fb967bf1322..a16dee1c7595 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
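Note on the block-construction changes in the hunks above: blocks now declare extra batch inputs as full `InputParam` objects (with `type_hint` and `description`) instead of bare strings, and reusable outputs are pulled from a shared `OutputParam.template(...)` lookup rather than per-name factory methods such as `OutputParam.latents()`. A minimal sketch of how such a template registry could be wired up is shown below; the class layout and registry contents are assumptions for illustration only, not the actual code in modular_pipeline_utils.py.

    from dataclasses import dataclass
    from typing import Any, ClassVar, Dict, Optional

    import torch


    @dataclass
    class OutputParam:
        name: str
        type_hint: Any = None
        description: str = ""

        # Hypothetical registry of frequently reused outputs; the real template()
        # in modular_pipeline_utils.py may organize this differently.
        _TEMPLATES: ClassVar[Dict[str, Dict[str, Any]]] = {
            "latents": {"type_hint": torch.Tensor, "description": "Denoised latents."},
            "images": {"type_hint": list, "description": "Generated images."},
        }

        @classmethod
        def template(cls, name: str, note: Optional[str] = None) -> "OutputParam":
            # Build an OutputParam from a registered template, optionally appending a note.
            spec = dict(cls._TEMPLATES[name])
            if note is not None:
                spec["description"] = f"{spec['description']} ({note})"
            return cls(name=name, **spec)


    # Usage mirroring the calls in this patch:
    latents_out = OutputParam.template("latents")
    images_out = OutputParam.template("images")
    annotated = OutputParam.template("latents", note="batch-expanded")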
- +import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam from .before_denoise import ( QwenImageEditPlusRoPEInputsStep, QwenImagePrepareLatentsStep, @@ -211,7 +211,7 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), - QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageEditPlusAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -302,7 +302,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -446,5 +446,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7d6c2ea0635a..2471750f2e0b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam @@ -255,7 +255,7 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), - QwenImageLayeredAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageLayeredAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -342,7 +342,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -484,5 +484,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] From 8d45ff5bf60a804a5eaf05933f028e2ddf9772f6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:22:04 +0100 Subject: [PATCH 17/23] apply auto docstring --- .../modular_pipeline_utils.py | 4 +- .../qwenimage/before_denoise.py | 312 ++++++++++++- .../modular_pipelines/qwenimage/decoders.py | 112 +++++ .../modular_pipelines/qwenimage/denoise.py | 295 +++++++++++- .../modular_pipelines/qwenimage/encoders.py | 323 +++++++++++++- .../modular_pipelines/qwenimage/inputs.py | 181 +++++++- .../qwenimage/modular_blocks_qwenimage.py | 421 ++++++++---------- .../modular_blocks_qwenimage_edit.py | 273 ++++++------ .../modular_blocks_qwenimage_edit_plus.py | 150 +++---- .../modular_blocks_qwenimage_layered.py | 216 +++------ 10 files changed, 1616 insertions(+), 671 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 5ef1b98f1ba3..6f1010daf219 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -898,12 +898,12 @@ def make_doc_string( # Add components section if provided if expected_components and len(expected_components) > 0: - components_str = format_components(expected_components, indent_level=2) + 
components_str = format_components(expected_components, indent_level=2, add_empty_lines=False) output += components_str + "\n\n" # Add configs section if provided if expected_configs and len(expected_configs) > 0: - configs_str = format_configs(expected_configs, indent_level=2) + configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False) output += configs_str + "\n\n" # Add inputs section diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index fc795b5f5a2f..0b8cd0f4b2d2 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -117,8 +117,39 @@ def get_timesteps(scheduler, num_inference_steps, strength): # 1. PREPARE LATENTS # ==================== - +# auto_docstring class QwenImagePrepareLatentsStep(ModularPipelineBlocks): + """ + Prepare initial random noise for the generation process + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. + + Outputs: + height (`int`): + if not set, updated to default value + width (`int`): + if not set, updated to default value + latents (`Tensor`): + The initial latents to use for the denoising process + """ model_name = "qwenimage" @property @@ -201,7 +232,41 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): + """ + Prepare initial random noise (B, layers+1, C, H, W) for the generation process + + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. 
+ + Outputs: + height (`int`): + if not set, updated to default value + width (`int`): + if not set, updated to default value + latents (`Tensor`): + The initial latents to use for the denoising process + """ model_name = "qwenimage-layered" @property @@ -285,7 +350,29 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): + """ + Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should alreadybe patchified. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and updated in input step.) + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + Outputs: + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. + """ model_name = "qwenimage" @property @@ -366,7 +453,28 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): + """ + Step that creates mask latents from preprocessed mask_image by interpolating to latent space. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. + + Outputs: + mask (`Tensor`): + The mask to use for the inpainting process. + """ model_name = "qwenimage" @property @@ -433,8 +541,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 2. SET TIMESTEPS # ==================== - +# auto_docstring class QwenImageSetTimestepsStep(ModularPipelineBlocks): + """ + Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`): + The initial random noised latents for the denoising process. Can be generated in prepare latents step. + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process + """ model_name = "qwenimage" @property @@ -500,7 +626,27 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): + """ + Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
+ sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and packed in input step.) + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. + """ model_name = "qwenimage-layered" @property @@ -562,7 +708,30 @@ def __call__(self, components, state: PipelineState) -> PipelineState: return components, state +# auto_docstring class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): + """ + Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare latents step. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`): + The latents to use for the denoising process. Can be generated in prepare latents step. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. + num_inference_steps (`int`): + The number of denoising steps to perform at inference time. Updated based on strength. + """ model_name = "qwenimage" @property @@ -646,8 +815,32 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - ## RoPE inputs for denoiser - +# auto_docstring class QwenImageRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the images latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ model_name = "qwenimage" @property @@ -715,7 +908,36 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + image_height (`int`): + The height of the reference image. Can be generated in input step. + image_width (`int`): + The width of the reference image. Can be generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. 
+ prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the images latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ model_name = "qwenimage" @property @@ -790,7 +1012,38 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus. + Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. + Should be placed after prepare_latents step. + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + image_height (`List`): + The heights of the reference images. Can be generated in input step. + image_width (`List`): + The widths of the reference images. Can be generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the image latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ model_name = "qwenimage-edit-plus" @property @@ -866,7 +1119,36 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. 
+ + Outputs: + img_shapes (`List`): + The shapes of the image latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + additional_t_cond (`Tensor`): + The additional t cond, used for RoPE calculation + """ model_name = "qwenimage-layered" @property @@ -948,7 +1230,31 @@ def __call__(self, components, state: PipelineState) -> PipelineState: ## ControlNet inputs for denoiser + +# auto_docstring class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): + """ + step that prepare inputs for controlnet. Insert before the Denoise Step, after set_timesteps step. + + Components: + controlnet (`QwenImageControlNetModel`) + + Inputs: + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + Outputs: + controlnet_keep (`List`): + The controlnet keep values + """ model_name = "qwenimage" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 4476e1db9bad..650bf34da7a3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -29,7 +29,27 @@ # after denoising loop (unpack latents) + +#auto_docstring class QwenImageAfterDenoiseStep(ModularPipelineBlocks): + """ + Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width) + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + latents (`Tensor`): + The latents to decode, can be generated in the denoise step. + + Outputs: + latents (`Tensor`): + The denoisedlatents unpacked to B, C, 1, H, W + """ model_name = "qwenimage" @property @@ -80,7 +100,28 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +#auto_docstring class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): + """ + Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising. + + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + + Outputs: + latents (`Tensor`): + Denoised latents. 
(unpacked to B, C, layers+1, H, W) + """ model_name = "qwenimage-layered" @property @@ -131,7 +172,23 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # decode step + +#auto_docstring class QwenImageDecoderStep(ModularPipelineBlocks): + """ + Step that decodes the latents to images + + Components: + vae (`AutoencoderKLQwenImage`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + + Outputs: + images (`List`): + Generated images. (tensor output of the vae decoder.) + """ model_name = "qwenimage" @property @@ -189,7 +246,25 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +#auto_docstring class QwenImageLayeredDecoderStep(ModularPipelineBlocks): + """ + Decode unpacked latents (B, C, layers+1, H, W) into layer images. + + Components: + vae (`AutoencoderKLQwenImage`) + image_processor (`VaeImageProcessor`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. + """ model_name = "qwenimage-layered" @property @@ -269,7 +344,25 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # postprocess the decoded images + +#auto_docstring class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): + """ + postprocess the generated image + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + images (`Tensor`): + the generated image tensor from decoders step + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. + """ model_name = "qwenimage" @property @@ -323,7 +416,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +#auto_docstring class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): + """ + postprocess the generated image, optional apply the mask overally to the original image.. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + images (`Tensor`): + the generated image tensor from decoders step + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + + Outputs: + images (`List`): + Generated images. + """ model_name = "qwenimage" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index ad6a9677aca3..ff6e411d7632 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -85,7 +85,7 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." 
), - InputParam.template("image_latents", note="generated in vae encoder step and updated in input step."), + InputParam.template("image_latents"), ] @torch.no_grad() @@ -197,13 +197,6 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step." - ), - InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -293,13 +286,6 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step." - ), - InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -427,19 +413,19 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam.template("image_latents", note="Can be generated from vae encoder step and updated in input step."), + InputParam.template("image_latents"), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." - ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents"), ] @torch.no_grad() @@ -521,6 +507,38 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports text2image and image2image tasks for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. 
+ """ model_name = "qwenimage" block_classes = [ @@ -546,6 +564,45 @@ def description(self) -> str: # Qwen Image (inpainting) # auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -572,6 +629,46 @@ def description(self) -> str: # Qwen Image (text2image, image2image) with controlnet # auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopBeforeDenoiserControlNet` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports text2img/img2img tasks with controlnet for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + controlnet (`QwenImageControlNetModel`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + control_image_latents (`Tensor`): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) + controlnet_keep (`List`): + The controlnet keep values. Can be generated in prepare_controlnet_inputs step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. 
+ **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -598,6 +695,53 @@ def description(self) -> str: # Qwen Image (inpainting) with controlnet # auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopBeforeDenoiserControlNet` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks with controlnet for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + controlnet (`QwenImageControlNetModel`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + control_image_latents (`Tensor`): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) + controlnet_keep (`List`): + The controlnet keep values. Can be generated in prepare_controlnet_inputs step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -632,6 +776,40 @@ def description(self) -> str: # Qwen Image Edit (image2image) # auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports QwenImage Edit. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. 
Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -656,6 +834,45 @@ def description(self) -> str: # Qwen Image Edit (inpainting) # auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks for QwenImage Edit. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -682,6 +899,40 @@ def description(self) -> str: # Qwen Image Layered (image2image) # auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports QwenImage Layered. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
+ num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-layered" block_classes = [ QwenImageEditLoopBeforeDenoiser, diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 9a83f0d7178a..083ee507ccbb 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -276,7 +276,23 @@ def encode_vae_image( # # In most of our other pipelines, resizing is done as part of the image preprocessing step. # ==================== + +# auto_docstring class QwenImageEditResizeStep(ModularPipelineBlocks): + """ + Image Resize step that resize the image to target area while maintaining the aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + resized_image (`List`): + The resized images + """ model_name = "qwenimage-edit" @@ -334,7 +350,24 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageLayeredResizeStep(ModularPipelineBlocks): + """ + Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + Outputs: + resized_image (`List`): + The resized images + """ model_name = "qwenimage-layered" @property @@ -405,7 +438,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditPlusResizeStep(ModularPipelineBlocks): + """ + Resize images for QwenImage Edit Plus pipeline. + Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text encoding. + Each image is resized independently based on its own aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + resized_image (`List`): + Images resized to 1024x1024 target area for VAE encoding + resized_cond_image (`List`): + Images resized to 384x384 target area for VL text encoding + """ model_name = "qwenimage-edit-plus" @@ -488,7 +540,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== # 2. 
GET IMAGE PROMPT # ==================== + +# auto_docstring class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): + """ + Auto-caption step that generates a text prompt from the input image if none is provided. + Uses the VL model (text_encoder) to generate a description of the image. + If prompt is already provided, this step passes through unchanged. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) + processor (`Qwen2VLProcessor`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + resized_image (`Image`): + The image to generate caption from, should be resized use the resize step + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + Outputs: + prompt (`str`): + The prompt or prompts to guide image generation. If not provided, updated using image caption + """ model_name = "qwenimage-layered" @@ -530,6 +605,16 @@ def inputs(self) -> List[InputParam]: ), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="prompt", + type_hint=str, + description="The prompt or prompts to guide image generation. If not provided, updated using image caption", + ), + ] + @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) @@ -567,7 +652,35 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # ==================== # 3. TEXT ENCODER # ==================== + +# auto_docstring class QwenImageTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step that generates text embeddings to guide the image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use + tokenizer (`Qwen2Tokenizer`): The tokenizer to use + guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ model_name = "qwenimage" def __init__(self): @@ -670,7 +783,34 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) + processor (`Qwen2VLProcessor`) + guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + resized_image (`Image`): + The image prompt to encode, should be resized using resize step + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. 
+ """ model_name = "qwenimage" def __init__(self): @@ -766,7 +906,34 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text embeddings for guiding image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) + processor (`Qwen2VLProcessor`) + guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + resized_cond_image (`Tensor`): + The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ model_name = "qwenimage-edit-plus" @@ -874,7 +1041,35 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== # 4. IMAGE PREPROCESS # ==================== + +# auto_docstring class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + mask_image (`Image`): + Mask image for inpainting. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + Outputs: + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + """ model_name = "qwenimage" @property @@ -954,7 +1149,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + mask_image (`Image`): + Mask image for inpainting. + resized_image (`Image`): + The resized image. should be generated using a resize step + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + Outputs: + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + """ model_name = "qwenimage-edit" @property @@ -1025,7 +1243,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. 
will resize the image to the given height and width. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + + Outputs: + processed_image (`Tensor`): + The processed image + """ model_name = "qwenimage" @property @@ -1087,7 +1324,22 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. Images needs to be resized first. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + resized_image (`List`): + The resized image. should be generated using a resize step + + Outputs: + processed_image (`Tensor`): + The processed image + """ model_name = "qwenimage-edit" @property @@ -1140,7 +1392,22 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + resized_image (`List`): + The resized image. should be generated using a resize step + + Outputs: + processed_image (`Tensor`): + The processed image + """ model_name = "qwenimage-edit-plus" @property @@ -1204,8 +1471,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== # 5. VAE ENCODER # ==================== + +# auto_docstring class QwenImageVaeEncoderStep(ModularPipelineBlocks): - """VAE encoder that handles both single images and lists of images with varied resolutions.""" + """ + VAE Encoder step that converts processed_image into latent representations image_latents. + Handles both single images and lists of images with varied resolutions. + + Components: + vae (`AutoencoderKLQwenImage`) + + Inputs: + processed_image (`Tensor`): + The image tensor to encode + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. + """ model_name = "qwenimage" @@ -1297,7 +1582,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): + """ + VAE Encoder step that converts `control_image` into latent representations control_image_latents. + + Components: + vae (`AutoencoderKLQwenImage`) + controlnet (`QwenImageControlNetModel`) + control_image_processor (`VaeImageProcessor`) + + Inputs: + control_image (`Image`): + Control image for ControlNet conditioning. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + control_image_latents (`Tensor`): + The latents representing the control image + """ model_name = "qwenimage" @property @@ -1411,7 +1719,20 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # ==================== # 6. 
PERMUTE LATENTS # ==================== + +# auto_docstring class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): + """ + Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing. + + Inputs: + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. (permuted from [B, C, 1, H, W] to [B, 1, C, H, W]) + """ model_name = "qwenimage-layered" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index b237031b91d2..0e03242e5e49 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -109,7 +109,42 @@ def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: in return height, width +# auto_docstring class QwenImageTextInputsStep(ModularPipelineBlocks): + """ + Text input processing step that standardizes text embeddings for the pipeline. + This step: + 1. Determines `batch_size` and `dtype` based on `prompt_embeds` + 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt) + + This block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps. + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) + """ model_name = "qwenimage" @property @@ -217,8 +252,47 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage: update height/width, expand batch, patchify.""" + """ + Input processing step that: + 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size + 2. For additional batch inputs: Expands batch dimensions to match final batch size + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be + generated in input step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + """ model_name = "qwenimage" @@ -385,8 +459,48 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage Edit Plus: handles list of latents with different sizes.""" + """ + Input processing step for Edit Plus that: + 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch + 2. For additional batch inputs: Expands batch dimensions to match final batch size + Height/width defaults to last image in the list. + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`List`): + The image heights calculated from the image latents dimension + image_width (`List`): + The image widths calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified, + concatenated, and batch-expanded) + """ model_name = "qwenimage-edit-plus" @@ -571,8 +685,44 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # same as QwenImageAdditionalInputsStep, but with layered pachifier. + +# auto_docstring class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier.""" + """ + Input processing step for Layered that: + 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size + 2. For additional batch inputs: Expands batch dimensions to match final batch size + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. 
+ + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered + pachifier and batch-expanded) + """ model_name = "qwenimage-layered" @@ -738,7 +888,32 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageControlNetInputsStep(ModularPipelineBlocks): + """ + prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps. + + Inputs: + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + + Outputs: + control_image_latents (`Tensor`): + The control image latents (patchified and batch-expanded). + height (`int`): + if not provided, updated to control image height + width (`int`): + if not provided, updated to control image width + """ model_name = "qwenimage" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 46f0b6f6ff5a..b50e41bb5079 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -65,26 +65,10 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: prompt (`str`, *optional*): The prompt or prompts to guide image generation. @@ -95,13 +79,13 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Outputs: prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. 
prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage" @@ -130,16 +114,14 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: mask_image (`Image`): Mask image for inpainting. - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -150,14 +132,14 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - TODO: Add description. - processed_mask_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage" @@ -180,14 +162,12 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -196,10 +176,10 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage" @@ -238,11 +218,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): - if `control_image` is not provided, step will be skipped. Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) Inputs: @@ -286,36 +263,50 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): Input step that prepares the inputs for the img2img denoising step. It: Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. 
+ prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) """ model_name = "qwenimage" @@ -335,38 +326,54 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): Input step that prepares the inputs for the inpainting denoising step. It: Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + processed_mask_image (`Tensor`): + The processed mask image (batch-expanded) """ model_name = "qwenimage" @@ -394,30 +401,31 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - TODO: Add description. - width (`None`): - TODO: Add description. - dtype (`None`): - TODO: Add description. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. Outputs: initial_noise (`Tensor`): The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. mask (`Tensor`): The mask to use for the inpainting process. """ @@ -445,26 +453,22 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. 
+ prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. height (`int`, *optional*): @@ -479,7 +483,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -523,34 +527,30 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -563,7 +563,7 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -609,32 +609,28 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. 
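Note (illustration only): the inpaint/img2img docstrings above reference `strength` without spelling out its effect on the schedule. Below is a generic, assumed sketch of the usual truncation behaviour; the helper name `truncate_timesteps` is made up and this is not the exact QwenImage code.

def truncate_timesteps(timesteps, num_inference_steps, strength):
    # Keep only the last `strength` fraction of the schedule: strength=1.0 starts
    # from pure noise, small strengths only lightly perturb the input image.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return timesteps[t_start:], num_inference_steps - t_start

timesteps = list(range(50, 0, -1))  # stand-in for a scheduler's timesteps
trimmed, steps_left = truncate_timesteps(timesteps, 50, strength=0.9)
assert steps_left == 45 and len(trimmed) == 45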
Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -647,7 +643,7 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -693,30 +689,25 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. - control_image_latents (`None`): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -735,12 +726,9 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -788,38 +776,33 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. - control_image_latents (`None`): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -836,12 +819,9 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
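Note (illustration only): `control_guidance_start` / `control_guidance_end` are typically converted into a per-step keep factor roughly as in the assumed sketch below; the function name and the exact progress computation are illustrative, not the library's implementation.

def controlnet_keep(num_steps, start=0.0, end=1.0):
    # 1.0 while the normalized progress lies inside [start, end], else 0.0, so the
    # ControlNet residuals are only added for that window of the schedule.
    keeps = []
    for i in range(num_steps):
        progress = i / max(num_steps - 1, 1)
        keeps.append(1.0 if start <= progress <= end else 0.0)
    return keeps

# Apply ControlNet only during the first half of a 10-step schedule.
print(controlnet_keep(10, start=0.0, end=0.5))  # five 1.0s followed by five 0.0s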
Outputs: @@ -891,36 +871,31 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - control_image_latents (`None`): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -937,12 +912,9 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -1058,20 +1030,18 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) 
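Note (illustration only): the decode steps describe `output_type` as 'pil', 'np', or 'pt'. A minimal, assumed sketch of what those options conventionally mean for a decoded VAE tensor follows; it is not the actual image-processor postprocessing code.

import torch
from PIL import Image

def postprocess(decoded: torch.Tensor, output_type: str = "pil"):
    # `decoded` is assumed to be a float tensor in [-1, 1] with shape (B, C, H, W).
    if output_type == "pt":
        return decoded
    images = (decoded / 2 + 0.5).clamp(0, 1)                    # [-1, 1] -> [0, 1]
    images = images.cpu().permute(0, 2, 3, 1).float().numpy()   # NCHW -> NHWC
    if output_type == "np":
        return images
    return [Image.fromarray((img * 255).round().astype("uint8")) for img in images]

sample = torch.rand(1, 3, 64, 64) * 2 - 1   # fake decoder output
pil_images = postprocess(sample, "pil")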
""" model_name = "qwenimage" @@ -1090,22 +1060,20 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage" @@ -1157,42 +1125,18 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: prompt (`str`, *optional*): The prompt or prompts to guide image generation. @@ -1202,8 +1146,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Maximum sequence length for prompt encoding. mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Image`, *optional*): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -1216,14 +1160,14 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Control image for ControlNet conditioning. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. 
Can be generated from text_encoder step. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -1232,29 +1176,26 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_image_latents (`None`, *optional*): - TODO: Add description. + control_image_latents (`Tensor`, *optional*): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 158763ce917a..0c1fa00842e5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -63,29 +63,14 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit VL encoder step that encode the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. 
negative_prompt (`str`, *optional*): @@ -95,13 +80,13 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage-edit" @@ -128,26 +113,23 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: resized_image (`List`): The resized images - processed_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage-edit" @@ -173,16 +155,13 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. mask_image (`Image`): Mask image for inpainting. padding_mask_crop (`int`, *optional*): @@ -193,14 +172,14 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): Outputs: resized_image (`List`): The resized images - processed_image (`None`): - TODO: Add description. - processed_mask_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage-edit" @@ -252,36 +231,50 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. 
+ negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) """ model_name = "qwenimage-edit" @@ -308,38 +301,54 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + processed_mask_image (`Tensor`): + The processed mask image (batch-expanded) """ model_name = "qwenimage-edit" @@ -368,30 +377,31 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the patchified latents `mask` based on the processed mask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - TODO: Add description. - width (`None`): - TODO: Add description. - dtype (`None`): - TODO: Add description. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. Outputs: initial_noise (`Tensor`): The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. mask (`Tensor`): The mask to use for the inpainting process. """ @@ -416,32 +426,28 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. 
Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -452,7 +458,7 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -496,34 +502,30 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -536,7 +538,7 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
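Note (illustration only): a common convention for how the inpainting `mask` is used at each step, assumed here rather than taken from the QwenImage-Edit implementation, is to re-inject the noised image latents in the unmasked region and keep the denoised latents only where the mask is 1.

import torch

def blend_inpaint_latents(latents, noised_image_latents, mask):
    # mask == 1 marks the region to repaint; mask == 0 keeps the original content.
    return (1 - mask) * noised_image_latents + mask * latents

latents = torch.randn(1, 16, 32, 32)
noised_image_latents = torch.randn(1, 16, 32, 32)
mask = torch.zeros(1, 1, 32, 32)
mask[..., 8:24, 8:24] = 1.0   # repaint only the centre square
latents = blend_inpaint_latents(latents, noised_image_latents, mask)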
Outputs: @@ -621,20 +623,18 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage-edit" @@ -653,22 +653,20 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage-edit" @@ -724,41 +722,20 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): @@ -775,10 +752,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): The height in pixels of the generated image. width (`int`): The width in pixels of the generated image. - image_latents (`None`): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -789,12 +766,12 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index a16dee1c7595..726c000f4b38 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -55,47 +55,32 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: + resized_image (`List`): + Images resized to 1024x1024 target area for VAE encoding resized_cond_image (`List`): - The resized images + Images resized to 384x384 target area for VL text encoding prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage-edit-plus" @@ -122,26 +107,25 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
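Note (illustration only): to make the 1024x1024 / 384x384 "target area" wording concrete, here is an assumed sketch of aspect-ratio-preserving target-area resizing; the rounding multiple and helper name are hypothetical, not taken from the resize processor.

import math

def target_area_size(width, height, target_area=1024 * 1024, multiple=32):
    # Scale so width*height is close to `target_area`, then snap both sides to a
    # multiple the VAE / patchifier can handle.
    scale = math.sqrt(target_area / (width * height))
    new_width = max(round(width * scale / multiple) * multiple, multiple)
    new_height = max(round(height * scale / multiple) * multiple, multiple)
    return new_width, new_height

print(target_area_size(1536, 640))                         # roughly one megapixel, e.g. (1600, 672)
print(target_area_size(1536, 640, target_area=384 * 384))  # much smaller size for the VL encoder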
Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: resized_image (`List`): - The resized images - processed_image (`None`): - TODO: Add description. + Images resized to 1024x1024 target area for VAE encoding + resized_cond_image (`List`): + Images resized to 384x384 target area for VL text encoding + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage-edit-plus" @@ -176,36 +160,50 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): - Defaults height/width from last image in the list. Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`List`): The image heights calculated from the image latents dimension image_width (`List`): The image widths calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
(patchified, + concatenated, and batch-expanded) """ model_name = "qwenimage-edit-plus" @@ -233,32 +231,28 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -269,7 +263,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -317,20 +311,18 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage-edit-plus" @@ -365,41 +357,19 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): @@ -420,7 +390,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 2471750f2e0b..37a06e9af254 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -56,73 +56,19 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - Configs: - - image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: - 1. Write the caption using natural, descriptive language without structured formats or rich text. - 2. Enrich caption details by including: - - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks - 3. 
Maintain authenticity and accuracy: - - Avoid generalizations - - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: - 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 - 2. 通过加入以下内容,丰富图注细节: - - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 - - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 - - 环境细节:例如天气、光照、颜色、纹理、气氛等 - - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 - 3. 保持真实性与准确性: - - 不要使用笼统的描述 - - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 prompt (`str`, *optional*): - The prompt to encode + The prompt or prompts to guide image generation. use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template negative_prompt (`str`, *optional*): @@ -133,14 +79,16 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Outputs: resized_image (`List`): The resized images + prompt (`str`): + The prompt or prompts to guide image generation. If not provided, updated using image caption prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage-layered" @@ -168,16 +116,13 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 generator (`Generator`, *optional*): @@ -186,10 +131,10 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Outputs: resized_image (`List`): The resized images - processed_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. 
""" model_name = "qwenimage-layered" @@ -220,36 +165,46 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImageLayeredPachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. - image_latents (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension height (`int`): - The height of the image output + if not provided, updated to image height width (`int`): - The width of the image output + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered + pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -275,28 +230,24 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Layered img2img task. Components: - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. - image_latents (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. 
+ negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. layers (`int`, *optional*, defaults to 4): @@ -309,7 +260,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -366,83 +317,24 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: - 1. Write the caption using natural, descriptive language without structured formats or rich text. - 2. Enrich caption details by including: - - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks - 3. Maintain authenticity and accuracy: - - Avoid generalizations - - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: - 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 - 2. 通过加入以下内容,丰富图注细节: - - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 - - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 - - 环境细节:例如天气、光照、颜色、纹理、气氛等 - - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 - 3. 
保持真实性与准确性: - - 不要使用笼统的描述 - - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 prompt (`str`, *optional*): - The prompt to encode + The prompt or prompts to guide image generation. use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template negative_prompt (`str`, *optional*): @@ -463,7 +355,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. From f056af1fbb24b79c6cc5360ea782abacd63c34fd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:27:40 +0100 Subject: [PATCH 18/23] make style --- .../modular_pipeline_utils.py | 18 +- .../qwenimage/before_denoise.py | 133 ++++++++----- .../modular_pipelines/qwenimage/decoders.py | 93 +++++---- .../modular_pipelines/qwenimage/denoise.py | 123 ++++++------ .../modular_pipelines/qwenimage/encoders.py | 177 ++++++++++-------- .../modular_pipelines/qwenimage/inputs.py | 91 ++++++--- .../qwenimage/modular_blocks_qwenimage.py | 136 +++++++------- .../modular_blocks_qwenimage_edit.py | 81 ++++---- .../modular_blocks_qwenimage_edit_plus.py | 37 ++-- .../modular_blocks_qwenimage_layered.py | 40 ++-- 10 files changed, 497 insertions(+), 432 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 6f1010daf219..a57212988e28 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -438,7 +438,7 @@ class ConfigSpec: "description": "Number of layers to extract from the image", }, # common intermediate inputs - "prompt_embeds":{ + "prompt_embeds": { "type_hint": torch.Tensor, "required": True, "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.", @@ -531,16 +531,16 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "InputPa raise ValueError(f"InputParam template for {template_name} not found") template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy() - + # Determine the actual param name: # 1. From overrides if provided # 2. From template if present # 3. 
Fall back to template_name name = overrides.pop("name", template_kwargs.pop("name", template_name)) - + if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" - + template_kwargs.update(overrides) return cls(name=name, **template_kwargs) @@ -564,18 +564,18 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "OutputP """Get template for name if exists, otherwise raise ValueError.""" if template_name not in OUTPUT_PARAM_TEMPLATES: raise ValueError(f"OutputParam template for {template_name} not found") - + template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy() - + # Determine the actual param name: # 1. From overrides if provided # 2. From template if present # 3. Fall back to template_name name = overrides.pop("name", template_kwargs.pop("name", template_name)) - + if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" - + template_kwargs.update(overrides) return cls(name=name, **template_kwargs) @@ -913,4 +913,4 @@ def make_doc_string( output += "\n\n" output += format_output_params(outputs, indent_level=2) - return output \ No newline at end of file + return output diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 0b8cd0f4b2d2..418d927f4faa 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -117,6 +117,7 @@ def get_timesteps(scheduler, num_inference_steps, strength): # 1. PREPARE LATENTS # ==================== + # auto_docstring class QwenImagePrepareLatentsStep(ModularPipelineBlocks): """ @@ -137,8 +138,8 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): generator (`Generator`, *optional*): Torch generator for deterministic generation. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. dtype (`dtype`, *optional*, defaults to torch.float32): The dtype of the model inputs, can be generated in input step. @@ -150,6 +151,7 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): latents (`Tensor`): The initial latents to use for the denoising process """ + model_name = "qwenimage" @property @@ -254,8 +256,8 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): generator (`Generator`, *optional*): Torch generator for deterministic generation. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. dtype (`dtype`, *optional*, defaults to torch.float32): The dtype of the model inputs, can be generated in input step. 
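Editor's illustrative sketch (not part of the patch): the hunks above add `InputParam.template()` / `OutputParam.template()` helpers that look up a named entry in the template dictionaries, append an optional `note` to the description in parentheses, apply keyword overrides, and fall back to the template name when no explicit `name` is given. The snippet below shows how a block might declare its inputs with those helpers, mirroring the usages in the later hunks of this patch. `MyExampleBlock` is hypothetical, and the import paths are assumptions based on the file locations shown in the diff.

    # Hypothetical block using the template helpers introduced above.
    from typing import List

    from diffusers.modular_pipelines.modular_pipeline import ModularPipelineBlocks
    from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

    class MyExampleBlock(ModularPipelineBlocks):
        @property
        def inputs(self) -> List[InputParam]:
            return [
                # Uses the template's name, type_hint, and description as-is.
                InputParam.template("prompt_embeds"),
                # The note is appended to the template description in parentheses.
                InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."),
                # Keyword overrides replace the template defaults (here: required, default).
                InputParam.template("height", required=True),
                InputParam.template("strength", default=0.9),
            ]

Passing an unknown template name raises a ValueError, so blocks fail fast if a template key is misspelled rather than silently producing an undocumented parameter.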
@@ -267,6 +269,7 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): latents (`Tensor`): The initial latents to use for the denoising process """ + model_name = "qwenimage-layered" @property @@ -353,7 +356,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): """ - Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should alreadybe patchified. + Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, + prepare_latents. Both noise and image latents should alreadybe patchified. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -362,8 +366,8 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -373,6 +377,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): latents (`Tensor`): The scaled noisy latents to use for inpainting/image-to-image denoising. """ + model_name = "qwenimage" @property @@ -396,10 +401,10 @@ def inputs(self) -> List[InputParam]: ), InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."), InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", ), ] @@ -475,6 +480,7 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage" @property @@ -541,10 +547,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 2. SET TIMESTEPS # ==================== + # auto_docstring class QwenImageSetTimestepsStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step. + Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents + step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -561,6 +569,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): timesteps (`Tensor`): The timesteps to use for the denoising process """ + model_name = "qwenimage" @property @@ -579,10 +588,10 @@ def inputs(self) -> List[InputParam]: InputParam.template("num_inference_steps"), InputParam.template("sigmas"), InputParam( - name="latents", + name="latents", required=True, type_hint=torch.Tensor, - description="The initial random noised latents for the denoising process. Can be generated in prepare latents step." + description="The initial random noised latents for the denoising process. 
Can be generated in prepare latents step.", ), ] @@ -640,13 +649,14 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): sigmas (`List`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and packed in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and packed in input step.) Outputs: timesteps (`Tensor`): The timesteps to use for the denoising process. """ + model_name = "qwenimage-layered" @property @@ -671,9 +681,7 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="timesteps", - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process." + name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process." ), ] @@ -711,7 +719,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # auto_docstring class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare latents step. + Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after + prepare latents step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -732,6 +741,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): num_inference_steps (`int`): The number of denoising steps to perform at inference time. Updated based on strength. """ + model_name = "qwenimage" @property @@ -750,10 +760,10 @@ def inputs(self) -> List[InputParam]: InputParam.template("num_inference_steps"), InputParam.template("sigmas"), InputParam( - "latents", - required=True, + "latents", + required=True, type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare latents step." + description="The latents to use for the denoising process. Can be generated in prepare latents step.", ), InputParam.template("strength", default=0.9), ] @@ -815,6 +825,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - ## RoPE inputs for denoiser + # auto_docstring class QwenImageRoPEInputsStep(ModularPipelineBlocks): """ @@ -822,8 +833,8 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`): The height in pixels of the generated image. width (`int`): @@ -841,6 +852,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage" @property @@ -911,12 +923,13 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): """ - Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. 
Should be placed after prepare_latents step + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after + prepare_latents step Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_height (`int`): The height of the reference image. Can be generated in input step. image_width (`int`): @@ -938,6 +951,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage" @property @@ -948,8 +962,18 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), - InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. Can be generated in input step."), + InputParam( + name="image_height", + required=True, + type_hint=int, + description="The height of the reference image. Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=int, + description="The width of the reference image. Can be generated in input step.", + ), InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), @@ -1016,13 +1040,13 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): """ Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus. - Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. - Should be placed after prepare_latents step. + Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed + after prepare_latents step. Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_height (`List`): The heights of the reference images. Can be generated in input step. image_width (`List`): @@ -1044,6 +1068,7 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage-edit-plus" @property @@ -1058,8 +1083,18 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=List[int], description="The heights of the reference images. Can be generated in input step."), - InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. Can be generated in input step."), + InputParam( + name="image_height", + required=True, + type_hint=List[int], + description="The heights of the reference images. 
Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=List[int], + description="The widths of the reference images. Can be generated in input step.", + ), InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), @@ -1126,8 +1161,8 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image height (`int`): @@ -1149,6 +1184,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): additional_t_cond (`Tensor`): The additional t cond, used for RoPE calculation """ + model_name = "qwenimage-layered" @property @@ -1231,6 +1267,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: ## ControlNet inputs for denoiser + # auto_docstring class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): """ @@ -1247,7 +1284,8 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -1255,6 +1293,7 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): controlnet_keep (`List`): The controlnet keep values """ + model_name = "qwenimage" @property @@ -1274,16 +1313,16 @@ def inputs(self) -> List[InputParam]: InputParam.template("control_guidance_end"), InputParam.template("controlnet_conditioning_scale"), InputParam( - name="control_image_latents", - required=True, - type_hint=torch.Tensor, - description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step." + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.", ), InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. 
Can be generated in set_timesteps step.", ), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 650bf34da7a3..1adbf6bdd355 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -30,10 +30,12 @@ # after denoising loop (unpack latents) -#auto_docstring + +# auto_docstring class QwenImageAfterDenoiseStep(ModularPipelineBlocks): """ - Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width) + Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, + channels, 1, height, width) Components: pachifier (`QwenImagePachifier`) @@ -50,6 +52,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks): latents (`Tensor`): The denoisedlatents unpacked to B, C, 1, H, W """ + model_name = "qwenimage" @property @@ -70,10 +73,10 @@ def inputs(self) -> List[InputParam]: InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step.", ), ] @@ -81,9 +84,7 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The denoisedlatents unpacked to B, C, 1, H, W" + name="latents", type_hint=torch.Tensor, description="The denoisedlatents unpacked to B, C, 1, H, W" ), ] @@ -100,7 +101,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -#auto_docstring +# auto_docstring class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): """ Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising. @@ -122,6 +123,7 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): latents (`Tensor`): Denoised latents. (unpacked to B, C, layers+1, H, W) """ + model_name = "qwenimage-layered" @property @@ -138,10 +140,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step.", ), InputParam.template("height", required=True), InputParam.template("width", required=True), @@ -173,7 +175,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # decode step -#auto_docstring + +# auto_docstring class QwenImageDecoderStep(ModularPipelineBlocks): """ Step that decodes the latents to images @@ -183,12 +186,14 @@ class QwenImageDecoderStep(ModularPipelineBlocks): Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. Outputs: images (`List`): Generated images. (tensor output of the vae decoder.) 
""" + model_name = "qwenimage" @property @@ -207,10 +212,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", ), ] @@ -246,18 +251,18 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -#auto_docstring +# auto_docstring class QwenImageLayeredDecoderStep(ModularPipelineBlocks): """ Decode unpacked latents (B, C, layers+1, H, W) into layer images. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -265,6 +270,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-layered" @property @@ -287,10 +293,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", ), InputParam.template("output_type"), ] @@ -345,7 +351,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # postprocess the decoded images -#auto_docstring + +# auto_docstring class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): """ postprocess the generated image @@ -363,6 +370,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage" @property @@ -384,10 +392,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="images", - required=True, - type_hint=torch.Tensor, - description="the generated image tensor from decoders step" + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", ), InputParam.template("output_type"), ] @@ -416,7 +424,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state -#auto_docstring +# auto_docstring class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): """ postprocess the generated image, optional apply the mask overally to the original image.. @@ -430,12 +438,14 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. 
+ The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): Generated images. """ + model_name = "qwenimage" @property @@ -457,16 +467,17 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="images", - required=True, - type_hint=torch.Tensor, - description="the generated image tensor from decoders step" + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", ), InputParam.template("output_type"), InputParam( - name="mask_overlay_kwargs", + name="mask_overlay_kwargs", type_hint=Dict[str, Any], - description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), + description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.", + ), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index ff6e411d7632..3b00fcb274df 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -50,10 +50,10 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", ), ] @@ -80,10 +80,10 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", ), InputParam.template("image_latents"), ] @@ -131,10 +131,10 @@ def inputs(self) -> List[InputParam]: ), InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."), InputParam( - name="controlnet_keep", - required=True, - type_hint=List[float], - description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step." + name="controlnet_keep", + required=True, + type_hint=List[float], + description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.", ), ] @@ -467,10 +467,10 @@ def loop_expected_components(self) -> List[ComponentSpec]: def loop_inputs(self) -> List[InputParam]: return [ InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", ), InputParam.template("num_inference_steps", required=True), ] @@ -505,21 +505,21 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # Qwen Image (text2image, image2image) + # auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. 
- Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports text2image and image2image tasks for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -539,6 +539,7 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ @@ -551,8 +552,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): @property def description(self) -> str: return ( - "Denoise step that iteratively denoise the latents. \n" - "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "Denoise step that iteratively denoise the latents.\n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n" "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" " - `QwenImageLoopBeforeDenoiser`\n" " - `QwenImageLoopDenoiser`\n" @@ -565,9 +566,9 @@ def description(self) -> str: # auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopDenoiser` - `QwenImageLoopAfterDenoiser` @@ -575,9 +576,8 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -603,6 +603,7 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -630,9 +631,9 @@ def description(self) -> str: # auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. 
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopBeforeDenoiserControlNet` - `QwenImageLoopDenoiser` @@ -640,10 +641,8 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports text2img/img2img tasks with controlnet for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - controlnet (`QwenImageControlNetModel`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -669,6 +668,7 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -696,9 +696,9 @@ def description(self) -> str: # auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopBeforeDenoiserControlNet` - `QwenImageLoopDenoiser` @@ -707,10 +707,8 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks with controlnet for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - controlnet (`QwenImageControlNetModel`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -742,6 +740,7 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -777,18 +776,17 @@ def description(self) -> str: # auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports QwenImage Edit. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -810,6 +808,7 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. 
""" + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -835,9 +834,9 @@ def description(self) -> str: # auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` @@ -845,9 +844,8 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks for QwenImage Edit. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -873,6 +871,7 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -900,18 +899,17 @@ def description(self) -> str: # auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports QwenImage Layered. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -933,6 +931,7 @@ class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageEditLoopBeforeDenoiser, diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 083ee507ccbb..5e1821cca5c0 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -30,7 +30,7 @@ from ...utils import logging from ...utils.torch_utils import unwrap_module from ..modular_pipeline import ModularPipelineBlocks, PipelineState -from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam from .modular_pipeline import QwenImageModularPipeline from .prompt_templates import ( QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE, @@ -277,6 +277,7 @@ def encode_vae_image( # In most of our other pipelines, resizing is done as part of the image preprocessing step. 
# ==================== + # auto_docstring class QwenImageEditResizeStep(ModularPipelineBlocks): """ @@ -293,8 +294,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): resized_image (`List`): The resized images """ - model_name = "qwenimage-edit" + model_name = "qwenimage-edit" @property def description(self) -> str: @@ -319,8 +320,8 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="resized_image", - type_hint=List[PIL.Image.Image], + name="resized_image", + type_hint=List[PIL.Image.Image], description="The resized images", ), ] @@ -353,7 +354,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageLayeredResizeStep(ModularPipelineBlocks): """ - Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio. + Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while + maintaining the aspect ratio. Components: image_resize_processor (`VaeImageProcessor`) @@ -368,11 +370,12 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): resized_image (`List`): The resized images """ + model_name = "qwenimage-layered" @property def description(self) -> str: - return f"Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." + return "Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." @property def expected_components(self) -> List[ComponentSpec]: @@ -399,11 +402,13 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="resized_image", - type_hint=List[PIL.Image.Image], - description="The resized images", - )] + return [ + OutputParam( + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", + ) + ] @staticmethod def check_inputs(resolution: int): @@ -442,8 +447,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditPlusResizeStep(ModularPipelineBlocks): """ Resize images for QwenImage Edit Plus pipeline. - Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text encoding. - Each image is resized independently based on its own aspect ratio. + Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text + encoding. Each image is resized independently based on its own aspect ratio. 
Components: image_resize_processor (`VaeImageProcessor`) @@ -484,7 +489,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: # image - return [InputParam.template("image")] + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -518,13 +523,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): resized_cond_images = [] for image in images: image_width, image_height = image.size - + # For VAE encoder (1024x1024 target area) vae_width, vae_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) - resized_images.append( - components.image_resize_processor.resize(image, height=vae_height, width=vae_width) - ) - + resized_images.append(components.image_resize_processor.resize(image, height=vae_height, width=vae_width)) + # For VL text encoder (384x384 target area) vl_width, vl_height, _ = calculate_dimensions(384 * 384, image_width / image_height) resized_cond_images.append( @@ -541,16 +544,16 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 2. GET IMAGE PROMPT # ==================== + # auto_docstring class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): """ Auto-caption step that generates a text prompt from the input image if none is provided. - Uses the VL model (text_encoder) to generate a description of the image. - If prompt is already provided, this step passes through unchanged. + Uses the VL model (text_encoder) to generate a description of the image. If prompt is already provided, this step + passes through unchanged. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) Inputs: prompt (`str`, *optional*): @@ -590,7 +593,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("prompt", required=False), # it is not required for qwenimage-layered, unlike other pipelines + InputParam.template( + "prompt", required=False + ), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -653,15 +658,15 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 3. TEXT ENCODER # ==================== + # auto_docstring class QwenImageTextEncoderStep(ModularPipelineBlocks): """ Text Encoder step that generates text embeddings to guide the image generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -681,6 +686,7 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. 
""" + model_name = "qwenimage" def __init__(self): @@ -706,7 +712,6 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -786,12 +791,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditTextEncoderStep(ModularPipelineBlocks): """ - Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation. + Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image + generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -811,6 +816,7 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. """ + model_name = "qwenimage" def __init__(self): @@ -835,7 +841,6 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -909,12 +914,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): """ - Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text embeddings for guiding image generation. + Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text + embeddings for guiding image generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -922,7 +927,8 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. resized_cond_image (`Tensor`): - The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step + The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using + resize step Outputs: prompt_embeds (`Tensor`): @@ -963,7 +969,6 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -1042,10 +1047,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 4. IMAGE PREPROCESS # ==================== + # auto_docstring class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width. + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be + resized to the given height and width. 
Components: image_mask_processor (`InpaintProcessor`) @@ -1070,6 +1077,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay """ + model_name = "qwenimage" @property @@ -1152,7 +1160,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first. + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be + resized first. Components: image_mask_processor (`InpaintProcessor`) @@ -1173,6 +1182,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay """ + model_name = "qwenimage-edit" @property @@ -1206,11 +1216,7 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image" - ), + OutputParam(name="processed_image", type_hint=torch.Tensor, description="The processed image"), OutputParam( name="processed_mask_image", type_hint=torch.Tensor, @@ -1263,6 +1269,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage" @property @@ -1290,11 +1297,13 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @staticmethod def check_inputs(height, width, vae_scale_factor): @@ -1340,6 +1349,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage-edit" @property @@ -1361,7 +1371,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="resized_image", + name="resized_image", required=True, type_hint=List[PIL.Image.Image], description="The resized image. should be generated using a resize step", @@ -1370,11 +1380,13 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1395,7 +1407,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images. + Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of + processed images. 
Components: image_processor (`VaeImageProcessor`) @@ -1408,6 +1421,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage-edit-plus" @property @@ -1427,20 +1441,24 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam( - name="resized_image", - required=True, - type_hint=List[PIL.Image.Image], - description="The resized image. should be generated using a resize step", - )] + return [ + InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + ) + ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1472,6 +1490,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 5. VAE ENCODER # ==================== + # auto_docstring class QwenImageVaeEncoderStep(ModularPipelineBlocks): """ @@ -1509,7 +1528,9 @@ def __init__( output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents". """ if input is None: - input = InputParam(name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode") + input = InputParam( + name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode" + ) if output is None: output = OutputParam.template("image_latents") @@ -1539,13 +1560,13 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - self._input, # default is "processed_image" + self._input, # default is "processed_image" InputParam.template("generator"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [self._output] # default is "image_latents" + return [self._output] # default is "image_latents" @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -1588,9 +1609,8 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): VAE Encoder step that converts `control_image` into latent representations control_image_latents. Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor + (`VaeImageProcessor`) Inputs: control_image (`Image`): @@ -1606,6 +1626,7 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): control_image_latents (`Tensor`): The latents representing the control image """ + model_name = "qwenimage" @property @@ -1720,6 +1741,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 6. PERMUTE LATENTS # ==================== + # auto_docstring class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): """ @@ -1733,11 +1755,12 @@ class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): image_latents (`Tensor`): The latent representation of the input image. 
(permuted from [B, C, 1, H, W] to [B, 1, C, H, W]) """ + model_name = "qwenimage-layered" @property def description(self) -> str: - return f"Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." + return "Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." @property def inputs(self) -> List[InputParam]: @@ -1760,4 +1783,4 @@ def __call__(self, components, state: PipelineState) -> PipelineState: block_state.image_latents = latents.permute(0, 2, 1, 3, 4) self.set_block_state(state, block_state) - return components, state \ No newline at end of file + return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 0e03242e5e49..818bbca5ed0a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple import torch @@ -117,7 +117,8 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): 1. Determines `batch_size` and `dtype` based on `prompt_embeds` 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt) - This block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps. + This block should be placed after all encoder steps to process the text embeddings before they are used in + subsequent pipeline steps. Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -145,6 +146,7 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. (batch-expanded) """ + model_name = "qwenimage" @property @@ -271,8 +273,8 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -300,7 +302,7 @@ def __init__( self, image_latent_inputs: Optional[List[InputParam]] = None, additional_batch_inputs: Optional[List[InputParam]] = None, - ): + ): # by default, process `image_latents` if image_latent_inputs is None: image_latent_inputs = [InputParam.template("image_latents")] @@ -319,7 +321,9 @@ def __init__( else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -376,13 +380,17 @@ def intermediate_outputs(self) -> List[OutputParam]: name="image_width", type_hint=int, description="The image width calculated from the image latents dimension", - ) + ), ] # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # image latent inputs are modified in place (patchified and batch-expanded) for input_param in self._image_latent_inputs: @@ -479,8 +487,8 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -526,7 +534,9 @@ def __init__( else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -587,11 +597,15 @@ def intermediate_outputs(self) -> List[OutputParam]: description="The image widths calculated from the image latents dimension", ), ] - + # `height`/`width` are updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) for input_param in self._image_latent_inputs: @@ -686,11 +700,13 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # same as QwenImageAdditionalInputsStep, but with layered pachifier. + # auto_docstring class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): """ Input processing step for Layered that: - 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size + 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch + size 2. For additional batch inputs: Expands batch dimensions to match final batch size Configured inputs: @@ -705,8 +721,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. @@ -720,8 +736,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): width (`int`): if not provided, updated to image width image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered - pachifier and batch-expanded) + image latents used to guide the image generation. Can be generated from vae_encoder step. 
(patchified + with layered pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -748,7 +764,9 @@ def __init__( else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -808,8 +826,12 @@ def intermediate_outputs(self) -> List[OutputParam]: ] if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) for input_param in self._image_latent_inputs: @@ -895,10 +917,11 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): Inputs: control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. height (`int`, *optional*): @@ -914,6 +937,7 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): width (`int`): if not provided, updated to control image width """ + model_name = "qwenimage" @property @@ -923,17 +947,26 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."), + InputParam( + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step.", + ), InputParam.template("batch_size"), InputParam.template("num_images_per_prompt"), InputParam.template("height"), InputParam.template("width"), ] - + @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The control image latents (patchified and batch-expanded)."), + OutputParam( + name="control_image_latents", + type_hint=torch.Tensor, + description="The control image latents (patchified and batch-expanded).", + ), OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"), OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index b50e41bb5079..5837799d3431 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -13,9 +13,10 @@ # limitations under the License. import torch + from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam from .before_denoise import ( QwenImageControlNetBeforeDenoiserStep, QwenImageCreateMaskLatentsStep, @@ -65,9 +66,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: prompt (`str`, *optional*): @@ -114,8 +114,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) + image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`) Inputs: mask_image (`Image`): @@ -162,8 +161,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -218,9 +216,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): - if `control_image` is not provided, step will be skipped. 
Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor + (`VaeImageProcessor`) Inputs: control_image (`Image`, *optional*): @@ -380,7 +377,9 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): block_classes = [ QwenImageTextInputsStep(), QwenImageAdditionalInputsStep( - additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] + additional_batch_inputs=[ + InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image") + ] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -401,15 +400,14 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -450,13 +448,12 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -524,13 +521,12 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -606,13 +602,12 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) 
for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -686,14 +681,12 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -707,7 +700,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): negative_prompt_embeds_mask (`Tensor`, *optional*): mask for the negative text embeddings. Can be generated from text_encoder step. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -773,14 +767,12 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -802,7 +794,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): processed_mask_image (`Tensor`, *optional*): The processed mask image control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. 
generator (`Generator`, *optional*): @@ -868,14 +861,12 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -895,7 +886,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -1030,12 +1022,12 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -1057,19 +1049,21 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) + vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. 
Outputs: images (`List`): @@ -1125,17 +1119,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) controlnet (`QwenImageControlNetModel`) + control_image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: prompt (`str`, *optional*): @@ -1185,7 +1173,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. control_image_latents (`Tensor`, *optional*): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): @@ -1195,7 +1184,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 0c1fa00842e5..e1e5c4335481 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -13,11 +13,12 @@ # limitations under the License. from typing import Optional + import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam from .before_denoise import ( QwenImageCreateMaskLatentsStep, QwenImageEditRoPEInputsStep, @@ -63,10 +64,8 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit VL encoder step that encode the image and text prompts together. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -113,9 +112,8 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -155,9 +153,8 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -354,7 +351,10 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep(additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] + QwenImageAdditionalInputsStep( + additional_batch_inputs=[ + InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image") + ] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -377,15 +377,14 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the patchified latents `mask` based on the processed mask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -426,10 +425,8 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -502,10 +499,8 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit inpaint task. 
Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -623,12 +618,12 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -650,19 +645,21 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) + vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): @@ -719,19 +716,14 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: image (`Union[Image, List]`): @@ -771,7 +763,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 726c000f4b38..37656cef5d76 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( QwenImageEditPlusRoPEInputsStep, QwenImagePrepareLatentsStep, @@ -55,10 +54,8 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -107,9 +104,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -231,10 +227,8 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -311,12 +305,12 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -357,14 +351,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 37a06e9af254..fdfeab048835 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam @@ -53,14 +52,12 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -116,9 +113,8 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -203,8 +199,8 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): width (`int`): if not provided, updated to image width image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered - pachifier and batch-expanded) + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified + with layered pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -230,10 +226,8 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Layered img2img task. Components: - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -317,16 +311,10 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) + image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: image (`Union[Image, List]`): From 94525200fdbc55f1f2ed1c6ef64cba8cd990da21 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:35:39 +0100 Subject: [PATCH 19/23] rmove space in make docstring --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index a57212988e28..5468cf54d0fc 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -893,7 +893,7 @@ def make_doc_string( # Add description if description: desc_lines = description.strip().split("\n") - aligned_desc = "\n".join(" " + line for line in desc_lines) + aligned_desc = "\n".join(" " + line.rstrip() for line in desc_lines) output += aligned_desc + "\n\n" # Add components section if provided From 7e9d2b954e734d382a138d69743025eab9f7aeba Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sun, 18 Jan 2026 22:44:44 -1000 Subject: [PATCH 20/23] Apply suggestions from code review --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 2 +- src/diffusers/modular_pipelines/qwenimage/before_denoise.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 5468cf54d0fc..8116f26d39a3 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -549,7 +549,7 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "InputPa class OutputParam: """Specification for an output parameter.""" - name: str = None + name: str type_hint: Any = None description: str = "" kwargs_type: str = None diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 418d927f4faa..aae6eb50d935 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -674,7 +674,7 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("num_inference_steps"), InputParam.template("sigmas"), - InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), + InputParam.template("image_latents"), ] @property From b7127ce7a72ddffadaf70c334effb24cf0422649 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:54:40 +0100 Subject: [PATCH 21/23] revert change in z --- 
src/diffusers/modular_pipelines/z_image/denoise.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py index a165fb513f3c..5f76a8459fde 100644 --- a/src/diffusers/modular_pipelines/z_image/denoise.py +++ b/src/diffusers/modular_pipelines/z_image/denoise.py @@ -129,7 +129,10 @@ def inputs(self) -> List[Tuple[str, Any]]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam.denoiser_input_fields(), + InputParam( + kwargs_type="denoiser_input_fields", + description="The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + ), ] guider_input_names = [] uncond_guider_input_names = [] From 1f9576a2ca97c6bacef9f79b570c7b859b663b13 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:56:14 +0100 Subject: [PATCH 22/23] fix --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 2 +- src/diffusers/modular_pipelines/qwenimage/before_denoise.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 8116f26d39a3..f3b12d716160 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -549,7 +549,7 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "InputPa class OutputParam: """Specification for an output parameter.""" - name: str + name: str type_hint: Any = None description: str = "" kwargs_type: str = None diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index aae6eb50d935..3c9d29260d12 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -649,8 +649,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): sigmas (`List`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be - generated from vae encoder and packed in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. 
Outputs: timesteps (`Tensor`): From 23d06423abf84f70414d2c42908fdd03485a7cf3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 19 Jan 2026 09:23:31 +0000 Subject: [PATCH 23/23] Apply style fixes --- .../pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py | 1 - src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py | 1 - .../pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py | 1 - .../stable_diffusion/pipeline_stable_diffusion_latent_upscale.py | 1 - 4 files changed, 4 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index 94c4c394465b..2ea7307fec32 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -84,7 +84,6 @@ >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL >>> from diffusers.utils import load_image - >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") >>> feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas") >>> controlnet = ControlNetModel.from_pretrained( diff --git a/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py b/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py index d259f7ee7865..b41d9772a7cc 100644 --- a/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py +++ b/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py @@ -53,7 +53,6 @@ >>> from transformers import AutoTokenizer, LlamaForCausalLM >>> from diffusers import HiDreamImagePipeline - >>> tokenizer_4 = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct") >>> text_encoder_4 = LlamaForCausalLM.from_pretrained( ... "meta-llama/Meta-Llama-3.1-8B-Instruct", diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py index df5b3f5c10a5..5a6b8d5e9f37 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py @@ -85,7 +85,6 @@ >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetPAGImg2ImgPipeline, AutoencoderKL >>> from diffusers.utils import load_image - >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas") >>> controlnet = ControlNetModel.from_pretrained( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 66d5ffa6b849..a1d0407caf5e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -459,7 +459,6 @@ def __call__( >>> from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline >>> import torch - >>> pipeline = StableDiffusionPipeline.from_pretrained( ... "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 ... )