From 7b499de6d04eab1180dd86ab667c6a66a816f0d6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 03:35:15 +0100 Subject: [PATCH 01/23] up --- .../modular_pipeline_utils.py | 127 +++++++++++++++++- .../qwenimage/before_denoise.py | 40 +++--- 2 files changed, 146 insertions(+), 21 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index aa421a53727b..afc4d6959a6f 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -17,7 +17,7 @@ from collections import OrderedDict from dataclasses import dataclass, field, fields from typing import Any, Dict, List, Literal, Optional, Type, Union - +import PIL.Image import torch from ..configuration_utils import ConfigMixin, FrozenDict @@ -342,6 +342,121 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" + @classmethod + def prompt(cls) -> "InputParam": + return cls(name="prompt", type_hint=str, required=True, + description="The prompt or prompts to guide image generation.") + + @classmethod + def negative_prompt(cls) -> "InputParam": + return cls(name="negative_prompt", type_hint=str, default=None, + description="The prompt or prompts not to guide the image generation.") + + @classmethod + def max_sequence_length(cls, default: int = 512) -> "InputParam": + return cls(name="max_sequence_length", type_hint=int, default=default, + description="Maximum sequence length for prompt encoding.") + + @classmethod + def height(cls, default: Optional[int] = None) -> "InputParam": + return cls(name="height", type_hint=int, default=default, + description="The height in pixels of the generated image.") + + @classmethod + def width(cls, default: Optional[int] = None) -> "InputParam": + return cls(name="width", type_hint=int, default=default, + description="The width in pixels of the generated image.") + + @classmethod + def num_inference_steps(cls, default: int = 50) -> "InputParam": + return cls(name="num_inference_steps", type_hint=int, default=default, + description="The number of denoising steps.") + + + @classmethod + def num_images_per_prompt(cls, default: int = 1) -> "InputParam": + return cls(name="num_images_per_prompt", type_hint=int, default=default, + description="The number of images to generate per prompt.") + + @classmethod + def generator(cls) -> "InputParam": + return cls(name="generator", type_hint=torch.Generator, default=None, + description="Torch generator for deterministic generation.") + + + @classmethod + def sigmas(cls) -> "InputParam": + return cls(name="sigmas", type_hint=List[float], default=None, + description="Custom sigmas for the denoising process.") + + @classmethod + def strength(cls, default: float = 0.9) -> "InputParam": + return cls(name="strength", type_hint=float, default=default, + description="Strength for img2img/inpainting.") + + @classmethod + def image(cls) -> "InputParam": + return cls(name="image", type_hint=PIL.Image.Image, required=True, + description="Input image for img2img, editing, or conditioning.") + + @classmethod + def mask_image(cls) -> "InputParam": + return cls(name="mask_image", type_hint=PIL.Image.Image, required=True, + description="Mask image for inpainting.") + + @classmethod + def control_image(cls) -> "InputParam": + return cls(name="control_image", type_hint=PIL.Image.Image, required=True, + description="Control image for ControlNet 
conditioning.") + + @classmethod + def padding_mask_crop(cls) -> "InputParam": + return cls(name="padding_mask_crop", type_hint=int, default=None, + description="Padding for mask cropping in inpainting.") + + + @classmethod + def latents(cls) -> "InputParam": + return cls(name="latents", type_hint=torch.Tensor, default=None, + description="Pre-generated noisy latents for image generation.") + + + @classmethod + def timesteps(cls) -> "InputParam": + return cls(name="timesteps", type_hint=torch.Tensor, default=None, + description="Timesteps for the denoising process.") + + + # ===================================================================== + # ControlNet + # ===================================================================== + + @classmethod + def control_guidance_start(cls, default: float = 0.0) -> "InputParam": + return cls(name="control_guidance_start", type_hint=float, default=default, + description="When to start applying ControlNet.") + + @classmethod + def control_guidance_end(cls, default: float = 1.0) -> "InputParam": + return cls(name="control_guidance_end", type_hint=float, default=default, + description="When to stop applying ControlNet.") + + @classmethod + def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": + return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, + description="Scale for ControlNet conditioning.") + + + @classmethod + def output_type(cls) -> "InputParam": + return cls(name="output_type", type_hint=str, default="pil", + description="Output format: 'pil', 'np', 'pt', or 'latent'.") + + @classmethod + def attention_kwargs(cls) -> "InputParam": + return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, + description="Additional kwargs for attention processors.") + @dataclass class OutputParam: @@ -357,6 +472,16 @@ def __repr__(self): f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>" ) + @classmethod + def images(cls) -> "OutputParam": + return cls(name="images", type_hint=List[PIL.Image.Image], + description="Generated images.") + + @classmethod + def latents(cls) -> "OutputParam": + return cls(name="latents", type_hint=torch.Tensor, + description="Denoised latents.") + def format_inputs_short(inputs): """ diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 0c66d6ea3303..6fa4a971c2c5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -134,11 +134,11 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents"), - InputParam(name="height"), - InputParam(name="width"), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="generator"), + InputParam.latents(), + InputParam.height(), + InputParam.width(), + InputParam.num_images_per_prompt(), + InputParam.generator(), InputParam( name="batch_size", required=True, @@ -225,12 +225,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents"), - InputParam(name="height"), - InputParam(name="width"), - InputParam(name="layers", default=4), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="generator"), + InputParam.latents(), + InputParam.height(), + InputParam.width(), + InputParam(name="layers", type_hint=int, 
default=4), + InputParam.num_images_per_prompt(), + InputParam.generator(), InputParam( name="batch_size", required=True, @@ -466,8 +466,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_inference_steps", default=50), - InputParam(name="sigmas"), + InputParam.num_inference_steps(), + InputParam.sigmas(), InputParam( name="latents", required=True, @@ -532,8 +532,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("num_inference_steps", default=50, type_hint=int), - InputParam("sigmas", type_hint=List[float]), + InputParam.num_inference_steps(), + InputParam.sigmas(), InputParam("image_latents", required=True, type_hint=torch.Tensor), ] @@ -590,8 +590,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_inference_steps", default=50), - InputParam(name="sigmas"), + InputParam.num_inference_steps(), + InputParam.sigmas(), InputParam( name="latents", required=True, @@ -971,9 +971,9 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam("control_guidance_start", default=0.0), - InputParam("control_guidance_end", default=1.0), - InputParam("controlnet_conditioning_scale", default=1.0), + InputParam.control_guidance_start(), + InputParam.control_guidance_end(), + InputParam.controlnet_conditioning_scale(), InputParam("control_image_latents", required=True), InputParam( "timesteps", From b29873dee72ea60e155a2a14a72e6e6ee6195b63 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 10:52:53 +0100 Subject: [PATCH 02/23] up up --- .../modular_pipeline_utils.py | 57 +++++++++++------ .../qwenimage/before_denoise.py | 6 +- .../modular_pipelines/qwenimage/decoders.py | 28 +++------ .../modular_pipelines/qwenimage/denoise.py | 16 ++--- .../modular_pipelines/qwenimage/encoders.py | 61 ++++++++++--------- .../modular_pipelines/qwenimage/inputs.py | 34 +++++------ .../qwenimage/modular_blocks_qwenimage.py | 26 ++++++-- .../modular_pipelines/z_image/denoise.py | 5 +- 8 files changed, 125 insertions(+), 108 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index afc4d6959a6f..cb179eccc7f7 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -342,6 +342,18 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" + + @classmethod + def template(cls, name: str) -> Optional["InputParam"]: + """Get template for name if exists, otherwise None.""" + if hasattr(cls, name) and callable(getattr(cls, name)): + return getattr(cls, name)() + return None + + # ====================================================== + # InputParam templates + # ====================================================== + @classmethod def prompt(cls) -> "InputParam": return cls(name="prompt", type_hint=str, required=True, @@ -383,7 +395,6 @@ def generator(cls) -> "InputParam": return cls(name="generator", type_hint=torch.Generator, default=None, description="Torch generator for deterministic generation.") - @classmethod def sigmas(cls) -> "InputParam": return cls(name="sigmas", type_hint=List[float], default=None, @@ -394,6 +405,7 @@ def strength(cls, default: float = 0.9) -> "InputParam": 
return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") + # images @classmethod def image(cls) -> "InputParam": return cls(name="image", type_hint=PIL.Image.Image, required=True, @@ -425,12 +437,24 @@ def latents(cls) -> "InputParam": def timesteps(cls) -> "InputParam": return cls(name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process.") + + @classmethod + def output_type(cls) -> "InputParam": + return cls(name="output_type", type_hint=str, default="pil", + description="Output format: 'pil', 'np', 'pt''.") - - # ===================================================================== - # ControlNet - # ===================================================================== + @classmethod + def attention_kwargs(cls) -> "InputParam": + return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, + description="Additional kwargs for attention processors.") + + @classmethod + def denoiser_input_fields(cls) -> "InputParam": + return cls(kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, + description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.") + + # ControlNet @classmethod def control_guidance_start(cls, default: float = 0.0) -> "InputParam": return cls(name="control_guidance_start", type_hint=float, default=default, @@ -446,18 +470,6 @@ def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, description="Scale for ControlNet conditioning.") - - @classmethod - def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", - description="Output format: 'pil', 'np', 'pt', or 'latent'.") - - @classmethod - def attention_kwargs(cls) -> "InputParam": - return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, - description="Additional kwargs for attention processors.") - - @dataclass class OutputParam: """Specification for an output parameter.""" @@ -472,6 +484,17 @@ def __repr__(self): f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>" ) + @classmethod + def template(cls, name: str) -> Optional["OutputParam"]: + """Get template for name if exists, otherwise None.""" + if hasattr(cls, name) and callable(getattr(cls, name)): + return getattr(cls, name)() + return None + + # ====================================================== + # OutputParam templates + # ====================================================== + @classmethod def images(cls) -> "OutputParam": return cls(name="images", type_hint=List[PIL.Image.Image], diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 6fa4a971c2c5..d61711e13a52 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -228,7 +228,7 @@ def inputs(self) -> List[InputParam]: InputParam.latents(), InputParam.height(), InputParam.width(), - InputParam(name="layers", type_hint=int, default=4), + InputParam(name="layers", type_hint=int, default=4, description="Number of layers to extract from the image"), InputParam.num_images_per_prompt(), InputParam.generator(), InputParam( @@ -598,7 +598,7 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The latents to use for the denoising process, used 
to calculate the image sequence length.", ), - InputParam(name="strength", default=0.9), + InputParam.strength(0.9), ] @property @@ -886,7 +886,7 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam(name="batch_size", required=True), - InputParam(name="layers", required=True), + InputParam(name="layers", default=4, description="Number of layers to extract from the image"), InputParam(name="height", required=True), InputParam(name="width", required=True), InputParam(name="prompt_embeds_mask"), diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 24a88ebfca3c..9c3a1c01d018 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -91,7 +91,7 @@ def inputs(self) -> List[InputParam]: InputParam("latents", required=True, type_hint=torch.Tensor), InputParam("height", required=True, type_hint=int), InputParam("width", required=True, type_hint=int), - InputParam("layers", required=True, type_hint=int), + InputParam("layers", default=4, description="Number of layers to extract from the image"), ] @torch.no_grad() @@ -141,11 +141,7 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[str]: return [ - OutputParam( - "images", - type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], - description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array", - ) + OutputParam.images() ] @torch.no_grad() @@ -198,14 +194,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("output_type", default="pil", type_hint=str), + InputParam("latents", required=True, type_hint=torch.Tensor, description="The latents to decode, can be generated in the denoise step"), + InputParam.output_type(), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]), + OutputParam.images(), ] @torch.no_grad() @@ -273,12 +269,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam("images", required=True, description="the generated image from decoders step"), - InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", - ), + InputParam.output_type(), ] @staticmethod @@ -323,12 +314,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam("images", required=True, description="the generated image from decoders step"), - InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", - ), + InputParam.output_type(), InputParam("mask_overlay_kwargs"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index eb1e5a341c68..472945b2269a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -218,7 +218,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), + InputParam.attention_kwargs(), InputParam( "latents", required=True, @@ -231,10 +231,7 @@ def 
inputs(self) -> List[InputParam]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), InputParam( "img_shapes", required=True, @@ -322,7 +319,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), + InputParam.attention_kwargs(), InputParam( "latents", required=True, @@ -335,10 +332,7 @@ def inputs(self) -> List[InputParam]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), InputParam( "img_shapes", required=True, @@ -424,7 +418,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."), + OutputParam.latents(), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 4b66dd32e521..2eca8645ef2c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -301,8 +301,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" + InputParam.template(self._image_input_name) or InputParam( + name=self._image_input_name, required=True, type_hint=torch.Tensor, description="Input image for conditioning" ), ] @@ -381,7 +381,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( + InputParam.template(self._image_input_name) or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), InputParam( @@ -484,7 +484,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( + InputParam.template(self._image_input_name) or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, @@ -564,7 +564,7 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", type_hint=str, description="The prompt to encode"), + InputParam(name="prompt", type_hint=str, description="The prompt to encode"), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -647,11 +647,9 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), - InputParam( - name="max_sequence_length", type_hint=int, description="The max sequence length to use", default=1024 - ), + InputParam.prompt(), + 
InputParam.negative_prompt(), + InputParam.max_sequence_length(1024), ] @property @@ -772,8 +770,8 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.prompt(), + InputParam.negative_prompt(), InputParam( name="resized_image", required=True, @@ -895,8 +893,8 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.prompt(), + InputParam.negative_prompt(), InputParam( name="resized_cond_image", required=True, @@ -1010,11 +1008,11 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("padding_mask_crop"), + InputParam.mask_image(), + InputParam.image(), + InputParam.height(), + InputParam.width(), + InputParam.padding_mask_crop(), ] @property @@ -1082,9 +1080,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("resized_image", required=True), - InputParam("padding_mask_crop"), + InputParam.mask_image(), + InputParam("resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step"), + InputParam.padding_mask_crop(), ] @property @@ -1140,9 +1138,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), + InputParam.image(), + InputParam.height(), + InputParam.width(), ] @property @@ -1312,7 +1310,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam(self._image_input_name, required=True), InputParam("generator")] + return [ + InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.generator(), + ] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -1383,10 +1384,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam("control_image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("generator"), + InputParam.control_image(), + InputParam.height(), + InputParam.width(), + InputParam.generator(), ] return inputs diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 4a1cf3700c57..e28493ecc369 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -129,7 +129,7 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"), InputParam(name="prompt_embeds_mask", required=True, 
kwargs_type="denoiser_input_fields"),
             InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"),
@@ -269,17 +269,17 @@ def expected_components(self) -> List[ComponentSpec]:
     @property
     def inputs(self) -> List[InputParam]:
         inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
+            InputParam.num_images_per_prompt(),
             InputParam(name="batch_size", required=True),
-            InputParam(name="height"),
-            InputParam(name="width"),
+            InputParam.height(),
+            InputParam.width(),
         ]

         for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
+            inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name))

         for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
+            inputs.append(InputParam.template(input_name) or InputParam(name=input_name))

         return inputs

@@ -398,17 +398,17 @@ def expected_components(self) -> List[ComponentSpec]:
     @property
     def inputs(self) -> List[InputParam]:
         inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
+            InputParam.num_images_per_prompt(),
             InputParam(name="batch_size", required=True),
-            InputParam(name="height"),
-            InputParam(name="width"),
+            InputParam.height(),
+            InputParam.width(),
         ]

         for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
+            inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name))

         for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
+            inputs.append(InputParam.template(input_name) or InputParam(name=input_name))

         return inputs

@@ -544,15 +544,15 @@ def expected_components(self) -> List[ComponentSpec]:
     @property
     def inputs(self) -> List[InputParam]:
         inputs = [
-            InputParam(name="num_images_per_prompt", default=1),
+            InputParam.num_images_per_prompt(),
             InputParam(name="batch_size", required=True),
         ]

         for image_latent_input_name in self._image_latent_inputs:
-            inputs.append(InputParam(name=image_latent_input_name))
+            inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name))

         for input_name in self._additional_batch_inputs:
-            inputs.append(InputParam(name=input_name))
+            inputs.append(InputParam.template(input_name) or InputParam(name=input_name))

         return inputs

@@ -638,9 +638,9 @@ def inputs(self) -> List[InputParam]:
         return [
             InputParam(name="control_image_latents", required=True),
             InputParam(name="batch_size", required=True),
-            InputParam(name="num_images_per_prompt", default=1),
-            InputParam(name="height"),
-            InputParam(name="width"),
+            InputParam.num_images_per_prompt(),
+            InputParam.height(),
+            InputParam.width(),
         ]

     @torch.no_grad()
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
index 63e9f5a28372..c349c7d9f224 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -54,7 +54,23 @@


 # ====================
-# 1. VAE ENCODER
+# 1. TEXT ENCODER
+# ====================
+
+class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
+    model_name = "qwenimage"
+    block_classes = [QwenImageTextEncoderStep()]
+    block_names = ["text_encoder"]
+    block_trigger_inputs = ["prompt"]
+
+    @property
+    def description(self) -> str:
+        return ("Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block."
+                " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided."
+                " - if `prompt` is not provided, step will be skipped.")
+
+# ====================
+# 2. VAE ENCODER
 # ====================


@@ -118,7 +134,7 @@ def description(self):


 # ====================
-# 2. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
+# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise)
 # ====================


@@ -396,7 +412,7 @@ def description(self):


 # ====================
-# 3. DECODE
+# 4. DECODE
 # ====================


@@ -439,11 +455,11 @@ def description(self):


 # ====================
-# 4. AUTO BLOCKS & PRESETS
+# 5. AUTO BLOCKS & PRESETS
 # ====================

 AUTO_BLOCKS = InsertableDict(
     [
-        ("text_encoder", QwenImageTextEncoderStep()),
+        ("text_encoder", QwenImageAutoTextEncoderStep()),
         ("vae_encoder", QwenImageAutoVaeEncoderStep()),
         ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
         ("denoise", QwenImageAutoCoreDenoiseStep()),
diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py
index 3d5a00a9df50..a165fb513f3c 100644
--- a/src/diffusers/modular_pipelines/z_image/denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/denoise.py
@@ -129,10 +129,7 @@ def inputs(self) -> List[Tuple[str, Any]]:
                 type_hint=int,
                 description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
-            ),
+            InputParam.denoiser_input_fields(),
         ]
         guider_input_names = []
         uncond_guider_input_names = []

From 43ab14845d9cbf090e0de0f1f284bdec54008954 Mon Sep 17 00:00:00 2001
From: yiyixuxu
Date: Sat, 10 Jan 2026 10:56:54 +0100
Subject: [PATCH 03/23] update outputs

---
 .../modular_pipelines/qwenimage/modular_blocks_qwenimage.py | 6 ++----
 .../qwenimage/modular_blocks_qwenimage_edit.py | 6 ++----
 .../qwenimage/modular_blocks_qwenimage_edit_plus.py | 6 ++----
 .../qwenimage/modular_blocks_qwenimage_layered.py | 6 ++----
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
index f58dffd922fc..e112578c399d 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -418,9 +418,7 @@ def description(self):
     @property
     def outputs(self):
         return [
-            OutputParam(
-                name="latents", 
type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -349,5 +347,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.images(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 99c5b109bf38..345b0cd93560 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -144,9 +144,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -196,5 +194,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.images(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 63ee36df5112..965f9e1976ad 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -142,9 +142,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -174,5 +172,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.images(), ] From 34a743e2dc36dc0ce7a86251ab3c4b74f89beb00 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 10:57:27 +0100 Subject: [PATCH 04/23] style --- .../modular_pipeline_utils.py | 191 +++++++++++------- .../qwenimage/before_denoise.py | 4 +- .../modular_pipelines/qwenimage/decoders.py | 15 +- .../modular_pipelines/qwenimage/encoders.py | 27 ++- .../qwenimage/modular_blocks_qwenimage.py | 6 +- .../modular_blocks_qwenimage_edit.py | 5 +- .../modular_blocks_qwenimage_edit_plus.py | 4 - .../modular_blocks_qwenimage_layered.py | 5 - 8 files changed, 155 insertions(+), 102 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index cb179eccc7f7..fab7c7193e5d 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -17,6 +17,7 @@ from collections import OrderedDict from dataclasses import dataclass, field, fields from typing import Any, Dict, List, Literal, Optional, Type, Union + import PIL.Image import torch @@ -342,7 +343,6 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" - @classmethod def template(cls, name: str) -> Optional["InputParam"]: """Get template for name if exists, otherwise None.""" @@ -356,119 +356,172 @@ def template(cls, name: str) -> Optional["InputParam"]: @classmethod def prompt(cls) -> "InputParam": - return cls(name="prompt", type_hint=str, required=True, - description="The 
prompt or prompts to guide image generation.") - + return cls( + name="prompt", type_hint=str, required=True, description="The prompt or prompts to guide image generation." + ) + @classmethod def negative_prompt(cls) -> "InputParam": - return cls(name="negative_prompt", type_hint=str, default=None, - description="The prompt or prompts not to guide the image generation.") - + return cls( + name="negative_prompt", + type_hint=str, + default=None, + description="The prompt or prompts not to guide the image generation.", + ) + @classmethod def max_sequence_length(cls, default: int = 512) -> "InputParam": - return cls(name="max_sequence_length", type_hint=int, default=default, - description="Maximum sequence length for prompt encoding.") - + return cls( + name="max_sequence_length", + type_hint=int, + default=default, + description="Maximum sequence length for prompt encoding.", + ) + @classmethod def height(cls, default: Optional[int] = None) -> "InputParam": - return cls(name="height", type_hint=int, default=default, - description="The height in pixels of the generated image.") - + return cls( + name="height", type_hint=int, default=default, description="The height in pixels of the generated image." + ) + @classmethod def width(cls, default: Optional[int] = None) -> "InputParam": - return cls(name="width", type_hint=int, default=default, - description="The width in pixels of the generated image.") + return cls( + name="width", type_hint=int, default=default, description="The width in pixels of the generated image." + ) @classmethod def num_inference_steps(cls, default: int = 50) -> "InputParam": - return cls(name="num_inference_steps", type_hint=int, default=default, - description="The number of denoising steps.") - - + return cls( + name="num_inference_steps", type_hint=int, default=default, description="The number of denoising steps." + ) + @classmethod def num_images_per_prompt(cls, default: int = 1) -> "InputParam": - return cls(name="num_images_per_prompt", type_hint=int, default=default, - description="The number of images to generate per prompt.") - + return cls( + name="num_images_per_prompt", + type_hint=int, + default=default, + description="The number of images to generate per prompt.", + ) + @classmethod def generator(cls) -> "InputParam": - return cls(name="generator", type_hint=torch.Generator, default=None, - description="Torch generator for deterministic generation.") - + return cls( + name="generator", + type_hint=torch.Generator, + default=None, + description="Torch generator for deterministic generation.", + ) + @classmethod def sigmas(cls) -> "InputParam": - return cls(name="sigmas", type_hint=List[float], default=None, - description="Custom sigmas for the denoising process.") - + return cls( + name="sigmas", type_hint=List[float], default=None, description="Custom sigmas for the denoising process." 
+ ) + @classmethod def strength(cls, default: float = 0.9) -> "InputParam": - return cls(name="strength", type_hint=float, default=default, - description="Strength for img2img/inpainting.") - + return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") + # images @classmethod def image(cls) -> "InputParam": - return cls(name="image", type_hint=PIL.Image.Image, required=True, - description="Input image for img2img, editing, or conditioning.") - + return cls( + name="image", + type_hint=PIL.Image.Image, + required=True, + description="Input image for img2img, editing, or conditioning.", + ) + @classmethod def mask_image(cls) -> "InputParam": - return cls(name="mask_image", type_hint=PIL.Image.Image, required=True, - description="Mask image for inpainting.") - + return cls( + name="mask_image", type_hint=PIL.Image.Image, required=True, description="Mask image for inpainting." + ) + @classmethod def control_image(cls) -> "InputParam": - return cls(name="control_image", type_hint=PIL.Image.Image, required=True, - description="Control image for ControlNet conditioning.") - + return cls( + name="control_image", + type_hint=PIL.Image.Image, + required=True, + description="Control image for ControlNet conditioning.", + ) + @classmethod def padding_mask_crop(cls) -> "InputParam": - return cls(name="padding_mask_crop", type_hint=int, default=None, - description="Padding for mask cropping in inpainting.") - + return cls( + name="padding_mask_crop", + type_hint=int, + default=None, + description="Padding for mask cropping in inpainting.", + ) @classmethod def latents(cls) -> "InputParam": - return cls(name="latents", type_hint=torch.Tensor, default=None, - description="Pre-generated noisy latents for image generation.") - - + return cls( + name="latents", + type_hint=torch.Tensor, + default=None, + description="Pre-generated noisy latents for image generation.", + ) + @classmethod def timesteps(cls) -> "InputParam": - return cls(name="timesteps", type_hint=torch.Tensor, default=None, - description="Timesteps for the denoising process.") + return cls( + name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process." + ) @classmethod def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", - description="Output format: 'pil', 'np', 'pt''.") - + return cls(name="output_type", type_hint=str, default="pil", description="Output format: 'pil', 'np', 'pt''.") + @classmethod def attention_kwargs(cls) -> "InputParam": - return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, - description="Additional kwargs for attention processors.") + return cls( + name="attention_kwargs", + type_hint=Dict[str, Any], + default=None, + description="Additional kwargs for attention processors.", + ) @classmethod def denoiser_input_fields(cls) -> "InputParam": - return cls(kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.") - + return cls( + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", + ) # ControlNet @classmethod def control_guidance_start(cls, default: float = 0.0) -> "InputParam": - return cls(name="control_guidance_start", type_hint=float, default=default, - description="When to start applying ControlNet.") - + return cls( + name="control_guidance_start", + type_hint=float, + default=default, + description="When to start applying ControlNet.", + ) + @classmethod def control_guidance_end(cls, default: float = 1.0) -> "InputParam": - return cls(name="control_guidance_end", type_hint=float, default=default, - description="When to stop applying ControlNet.") - + return cls( + name="control_guidance_end", + type_hint=float, + default=default, + description="When to stop applying ControlNet.", + ) + @classmethod def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": - return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, - description="Scale for ControlNet conditioning.") + return cls( + name="controlnet_conditioning_scale", + type_hint=float, + default=default, + description="Scale for ControlNet conditioning.", + ) + @dataclass class OutputParam: @@ -497,13 +550,11 @@ def template(cls, name: str) -> Optional["OutputParam"]: @classmethod def images(cls) -> "OutputParam": - return cls(name="images", type_hint=List[PIL.Image.Image], - description="Generated images.") - + return cls(name="images", type_hint=List[PIL.Image.Image], description="Generated images.") + @classmethod def latents(cls) -> "OutputParam": - return cls(name="latents", type_hint=torch.Tensor, - description="Denoised latents.") + return cls(name="latents", type_hint=torch.Tensor, description="Denoised latents.") def format_inputs_short(inputs): diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index d61711e13a52..cb808b1d3807 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -228,7 +228,9 @@ def inputs(self) -> List[InputParam]: InputParam.latents(), InputParam.height(), InputParam.width(), - InputParam(name="layers", type_hint=int, default=4, description="Number of layers to extract from the image"), + InputParam( + name="layers", type_hint=int, default=4, description="Number of layers to extract from the image" + ), InputParam.num_images_per_prompt(), InputParam.generator(), InputParam( diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 9c3a1c01d018..8207e99b69ae 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Union +from typing import List -import numpy as np -import PIL import torch from ...configuration_utils import FrozenDict @@ -140,9 +138,7 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[str]: - return [ - OutputParam.images() - ] + return [OutputParam.images()] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -194,7 +190,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor, description="The latents to decode, can be generated in the denoise step"), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step", + ), InputParam.output_type(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 2eca8645ef2c..f0dd6471b168 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -301,8 +301,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="Input image for conditioning" + InputParam.template(self._image_input_name) + or InputParam( + name=self._image_input_name, + required=True, + type_hint=torch.Tensor, + description="Input image for conditioning", ), ] @@ -381,7 +385,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam( + InputParam.template(self._image_input_name) + or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), InputParam( @@ -484,7 +489,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam( + InputParam.template(self._image_input_name) + or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, @@ -564,7 +570,9 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", type_hint=str, description="The prompt to encode"), # it is not required for qwenimage-layered, unlike other pipelines + InputParam( + name="prompt", type_hint=str, description="The prompt to encode" + ), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -1081,7 +1089,12 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.mask_image(), - InputParam("resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step"), + InputParam( + "resized_image", + required=True, + type_hint=PIL.Image.Image, + description="The resized image. 
should be generated using a resize step", + ), InputParam.padding_mask_crop(), ] @@ -1311,7 +1324,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), InputParam.generator(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index e112578c399d..d6117a12a57d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks @@ -62,6 +58,7 @@ # 1. TEXT ENCODER # ==================== + class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] @@ -74,6 +71,7 @@ def description(self) -> str: " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided." " - if `prompt` is not provided, step will be skipped." + # ==================== # 2. VAE ENCODER # ==================== diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 30fcb842d591..14d0945dbe57 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional - -import PIL.Image -import torch +from typing import Optional from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 345b0cd93560..fbe5e60f353f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 965f9e1976ad..e91a5c40b19b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -13,11 +13,6 @@ # limitations under the License. 
-from typing import List - -import PIL.Image -import torch - from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam From ff09bf1a631e38683205217e8dba4961de090319 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 11:55:03 +0100 Subject: [PATCH 05/23] add modular_auto_docstring! --- .../qwenimage/modular_blocks_qwenimage.py | 814 +++++++++++++++++- utils/modular_auto_docstring.py | 296 +++++++ 2 files changed, 1104 insertions(+), 6 deletions(-) create mode 100644 utils/modular_auto_docstring.py diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index d6117a12a57d..19feffe77eda 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -58,8 +58,59 @@ # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): + """ + class QwenImageAutoTextEncoderStep + + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + + Components: + + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] block_names = ["text_encoder"] @@ -76,8 +127,54 @@ def description(self) -> str: # 2. VAE ENCODER # ==================== - +#auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintVaeEncoderStep + + This step is used for processing image and mask inputs for inpainting tasks. It: + - Resizes the image to the target size, based on `height` and `width`. + - Processes and updates `image` and `mask_image`. + - Creates `image_latents`. + + Components: + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + mask_image (`Image`): + Mask image for inpainting. + + image (`Image`): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. 
+ + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + processed_image (`None`): + + processed_mask_image (`None`): + + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @@ -92,7 +189,40 @@ def description(self) -> str: ) +#auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageImg2ImgVaeEncoderStep + + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage" block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -103,7 +233,6 @@ def description(self) -> str: return "Vae encoder step that preprocess andencode the image inputs into their latent representations." -# Auto VAE encoder class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] block_names = ["inpaint", "img2img"] @@ -121,7 +250,43 @@ def description(self): # optional controlnet vae encoder +#auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): + """ + class QwenImageOptionalControlNetVaeEncoderStep + + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. + - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. + - if `control_image` is not provided, step will be skipped. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + control_image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + control_image (`Image`, *optional*): + Control image for ControlNet conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + control_image_latents (`Tensor`): + The latents representing the control image + """ block_classes = [QwenImageControlNetVaeEncoderStep] block_names = ["controlnet"] block_trigger_inputs = ["control_image"] @@ -142,7 +307,52 @@ def description(self): # assemble input steps +#auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): + """ + class QwenImageImg2ImgInputStep + + Input step that prepares the inputs for the img2img denoising step. 
It: + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage" block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] block_names = ["text_inputs", "additional_inputs"] @@ -154,7 +364,54 @@ def description(self): " - update height/width based `image_latents`, patchify `image_latents`." +#auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintInputStep + + Input step that prepares the inputs for the inpainting denoising step. It: + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -172,7 +429,49 @@ def description(self): # assemble prepare latents steps +#auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintPrepareLatentsStep + + This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: + - Add noise to the image latents to create the latents input for the denoiser. + - Create the pachified latents `mask` based on the processedmask image. + + Components: + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + + image_latents (`Tensor`): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
+ + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + + height (`None`): + + width (`None`): + + dtype (`None`): + + Outputs: + + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + + mask (`Tensor`): + The mask to use for the inpainting process. + """ model_name = "qwenimage" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -190,7 +489,66 @@ def description(self) -> str: # Qwen Image (text2image) +#auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageCoreDenoiseStep + + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -212,10 +570,81 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (inpainting) +#auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. 
+ + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageInpaintInputStep(), @@ -240,9 +669,78 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (image2image) +#auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageImg2ImgCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageImg2ImgInputStep(), @@ -267,9 +765,87 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." 
+ @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (text2image) with controlnet +#auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageControlNetCoreDenoiseStep + + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + control_image_latents (`None`): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -295,10 +871,95 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (inpainting) with controlnet +#auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageControlNetInpaintCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + control_image_latents (`None`): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageInpaintInputStep(), @@ -327,9 +988,93 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] + # Qwen Image (image2image) with controlnet +#auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageControlNetImg2ImgCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + control_image_latents (`None`): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. 
+ + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageImg2ImgInputStep(), @@ -357,7 +1102,12 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Auto denoise step for QwenImage class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): @@ -426,7 +1176,32 @@ def outputs(self): # standard decode step works for most tasks except for inpaint +#auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -437,7 +1212,34 @@ def description(self): # Inpaint decode step +#auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py new file mode 100644 index 000000000000..c6aaf8a46a56 --- /dev/null +++ b/utils/modular_auto_docstring.py @@ -0,0 +1,296 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Auto Docstring Generator for Modular Pipeline Blocks + +This script scans Python files for classes that have `# auto_docstring` comment above them +and inserts/updates the docstring from the class's `doc` property. + +Run from the root of the repo: + python utils/modular_auto_docstring.py [path] [--fix_and_overwrite] + +Examples: + # Check for auto_docstring markers (will error if found without proper docstring) + python utils/modular_auto_docstring.py + + # Check specific directory + python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/ + + # Fix and overwrite the docstrings + python utils/modular_auto_docstring.py --fix_and_overwrite + +Usage in code: + # auto_docstring + class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): + # docstring will be automatically inserted here + + @property + def doc(self): + return "Your docstring content..." +""" + +import argparse +import ast +import glob +import importlib +import os +import re +import sys + + +# All paths are set with the intent you should run this script from the root of the repo +DIFFUSERS_PATH = "src/diffusers" +REPO_PATH = "." + +# Pattern to match the auto_docstring comment +AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$") + + +def setup_diffusers_import(): + """Setup import path to use the local diffusers module.""" + src_path = os.path.join(REPO_PATH, "src") + if src_path not in sys.path: + sys.path.insert(0, src_path) + + +def get_module_from_filepath(filepath: str) -> str: + """Convert a filepath to a module name.""" + filepath = os.path.normpath(filepath) + + if filepath.startswith("src" + os.sep): + filepath = filepath[4:] + + if filepath.endswith(".py"): + filepath = filepath[:-3] + + module_name = filepath.replace(os.sep, ".") + return module_name + + +def load_module(filepath: str): + """Load a module from filepath.""" + setup_diffusers_import() + module_name = get_module_from_filepath(filepath) + + try: + module = importlib.import_module(module_name) + return module + except Exception as e: + print(f"Warning: Could not import module {module_name}: {e}") + return None + + +def get_doc_from_class(module, class_name: str) -> str: + """Get the doc property from an instantiated class.""" + if module is None: + return None + + cls = getattr(module, class_name, None) + if cls is None: + return None + + try: + instance = cls() + if hasattr(instance, "doc"): + return instance.doc + except Exception as e: + print(f"Warning: Could not instantiate {class_name}: {e}") + + return None + + +def find_auto_docstring_classes(filepath: str) -> list: + """ + Find all classes in a file that have # auto_docstring comment above them. 
+ + Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) + """ + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Parse AST to find class locations and their docstrings + content = "".join(lines) + try: + tree = ast.parse(content) + except SyntaxError as e: + print(f"Syntax error in {filepath}: {e}") + return [] + + # Build a map of class_name -> (class_line, has_docstring, docstring_end_line) + class_info = {} + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + has_docstring = False + docstring_end_line = node.lineno # default to class line + + if node.body and isinstance(node.body[0], ast.Expr): + first_stmt = node.body[0] + if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str): + has_docstring = True + docstring_end_line = first_stmt.end_lineno or first_stmt.lineno + + class_info[node.name] = (node.lineno, has_docstring, docstring_end_line) + + # Now scan for # auto_docstring comments + classes_to_update = [] + + for i, line in enumerate(lines): + if AUTO_DOCSTRING_PATTERN.match(line): + # Found the marker, look for class definition on next non-empty, non-comment line + j = i + 1 + while j < len(lines): + next_line = lines[j].strip() + if next_line and not next_line.startswith("#"): + break + j += 1 + + if j < len(lines) and lines[j].strip().startswith("class "): + # Extract class name + match = re.match(r"class\s+(\w+)", lines[j].strip()) + if match: + class_name = match.group(1) + if class_name in class_info: + class_line, has_docstring, docstring_end_line = class_info[class_name] + classes_to_update.append(( + class_name, + class_line, + has_docstring, + docstring_end_line + )) + + return classes_to_update + + +def format_docstring(doc: str, indent: str = " ") -> str: + """Format a doc string as a properly indented docstring.""" + lines = doc.strip().split("\n") + + if len(lines) == 1: + return f'{indent}"""{lines[0]}"""\n' + else: + result = [f'{indent}"""\n'] + for line in lines: + if line.strip(): + result.append(f"{indent}{line}\n") + else: + result.append("\n") + result.append(f'{indent}"""\n') + return "".join(result) + + +def process_file(filepath: str, overwrite: bool = False) -> list: + """ + Process a file and find/insert docstrings for # auto_docstring marked classes. + + Returns list of classes that need updating. 
+ """ + classes_to_update = find_auto_docstring_classes(filepath) + + if not classes_to_update: + return [] + + if not overwrite: + # Just return the list of classes that need updating + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] + + # Load the module to get doc properties + module = load_module(filepath) + + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Process in reverse order to maintain line numbers + updated = False + for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update): + doc = get_doc_from_class(module, class_name) + + if doc is None: + print(f"Warning: Could not get doc for {class_name} in {filepath}") + continue + + # Format the new docstring with 4-space indent + new_docstring = format_docstring(doc, " ") + + if has_docstring: + # Replace existing docstring (line after class definition to docstring_end_line) + # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line + lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:] + else: + # Insert new docstring right after class definition line + # class_line is 1-indexed, so lines[class_line-1] is the class line + # Insert at position class_line (which is right after the class line) + lines = lines[:class_line] + [new_docstring] + lines[class_line:] + + updated = True + print(f"Updated docstring for {class_name} in {filepath}") + + if updated: + with open(filepath, "w", encoding="utf-8", newline="\n") as f: + f.writelines(lines) + + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] + + +def check_auto_docstrings(path: str = None, overwrite: bool = False): + """ + Check all files for # auto_docstring markers and optionally fix them. + """ + if path is None: + path = DIFFUSERS_PATH + + if os.path.isfile(path): + all_files = [path] + else: + all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True) + + all_markers = [] + + for filepath in all_files: + markers = process_file(filepath, overwrite) + all_markers.extend(markers) + + if not overwrite and len(all_markers) > 0: + message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers]) + raise ValueError( + f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n" + f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them." 
+ ) + + if overwrite and len(all_markers) > 0: + print(f"\nUpdated {len(all_markers)} docstring(s).") + elif len(all_markers) == 0: + print("No # auto_docstring markers found.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check and fix # auto_docstring markers in modular pipeline blocks", + ) + parser.add_argument( + "path", + nargs="?", + default=None, + help="File or directory to process (default: src/diffusers)" + ) + parser.add_argument( + "--fix_and_overwrite", + action="store_true", + help="Whether to fix the docstrings by inserting them from doc property.", + ) + + args = parser.parse_args() + + check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file From d20f413f78822e9513bd60c203bf0f58885b3a54 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:11:28 +0100 Subject: [PATCH 06/23] more auto docstring --- .../modular_blocks_qwenimage_edit.py | 471 +++++++++++++++++- .../modular_blocks_qwenimage_edit_plus.py | 226 ++++++++- .../modular_blocks_qwenimage_layered.py | 245 ++++++++- 3 files changed, 935 insertions(+), 7 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 14d0945dbe57..cae6236eb5aa 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -55,9 +55,62 @@ # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): - """VL encoder that takes both image and text prompts.""" + """ + class QwenImageEditVLEncoderStep + + QwenImage-Edit VL encoder step that encode the image and text prompts together. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + Outputs: + + resized_image (`List`): + The resized images + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage-edit" block_classes = [ @@ -77,7 +130,39 @@ def description(self) -> str: # Edit VAE encoder +#auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageEditVaeEncoderStep + + Vae encoder step that encode the image inputs into their latent representations. 
+ + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -92,7 +177,53 @@ def description(self) -> str: # Edit Inpaint VAE encoder +#auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintVaeEncoderStep + + This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: + - resize the image for target area (1024 * 1024) while maintaining the aspect ratio. + - process the resized image and mask image. + - create image latents. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + mask_image (`Image`): + Mask image for inpainting. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + processed_mask_image (`None`): + + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -134,7 +265,54 @@ def description(self): # assemble input steps +#auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): + """ + class QwenImageEditInputStep + + Input step that prepares the inputs for the edit denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. 
+ + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -151,7 +329,56 @@ def description(self): ) +#auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintInputStep + + Input step that prepares the inputs for the edit inpaint denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -171,7 +398,49 @@ def description(self): # assemble prepare latents steps +#auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintPrepareLatentsStep + + This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: + - Add noise to the image latents to create the latents input for the denoiser. + - Create the patchified latents `mask` based on the processed mask image. + + Components: + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + + image_latents (`Tensor`): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + + height (`None`): + + width (`None`): + + dtype (`None`): + + Outputs: + + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + + mask (`Tensor`): + The mask to use for the inpainting process. 
+ """ model_name = "qwenimage-edit" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -186,7 +455,68 @@ def description(self) -> str: # Qwen Image Edit (image2image) core denoise step +#auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageEditCoreDenoiseStep + + Core denoising workflow for QwenImage-Edit edit (img2img) task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditInputStep(), @@ -209,9 +539,81 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Core denoising workflow for QwenImage-Edit edit (img2img) task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] + # Qwen Image Edit (inpainting) core denoise step +#auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintCoreDenoiseStep + + Core denoising workflow for QwenImage-Edit edit inpaint task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. 
+ + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditInpaintInputStep(), @@ -236,6 +638,12 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Core denoising workflow for QwenImage-Edit edit inpaint task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] + # Auto core denoise step for QwenImage Edit class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): @@ -263,7 +671,12 @@ def description(self): " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." ) - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # ==================== # 4. DECODE @@ -271,7 +684,32 @@ def description(self): # Decode step (standard) +#auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageEditDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -282,7 +720,34 @@ def description(self): # Inpaint decode step +#auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index fbe5e60f353f..2fcd633f0d7f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -49,8 +49,64 @@ # ==================== +#auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): - """VL encoder that takes both image and text prompts. 
Uses 384x384 target area.""" + """ + class QwenImageEditPlusVLEncoderStep + + QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + Outputs: + + resized_cond_image (`List`): + The resized images + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage-edit-plus" block_classes = [ @@ -69,8 +125,40 @@ def description(self) -> str: # ==================== +#auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): - """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area.""" + """ + class QwenImageEditPlusVaeEncoderStep + + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-edit-plus" block_classes = [ @@ -94,7 +182,56 @@ def description(self) -> str: # assemble input steps +#auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): + """ + class QwenImageEditPlusInputStep + + Input step that prepares the inputs for the Edit Plus denoising step. It: + - Standardizes text embeddings batch size. + - Processes list of image latents: patchifies, concatenates along dim=1, expands batch. + - Outputs lists of image_height/image_width for RoPE calculation. + - Defaults height/width from last image in the list. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`List`): + The image heights calculated from the image latents dimension + + image_width (`List`): + The image widths calculated from the image latents dimension + """ model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), @@ -114,7 +251,67 @@ def description(self): # Qwen Image Edit Plus (image2image) core denoise step +#auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageEditPlusCoreDenoiseStep + + Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusInputStep(), @@ -149,7 +346,32 @@ def outputs(self): # ==================== +#auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageEditPlusDecodeStep + + Decode step that decodes the latents to images and postprocesses the generated image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. 
+ """ model_name = "qwenimage-edit-plus" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index e91a5c40b19b..f647f16868ab 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -49,9 +49,111 @@ # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): - """Text encoder that takes text prompt, will generate a prompt based on image if not provided.""" + """ + class QwenImageLayeredTextEncoderStep + + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + image_caption_prompt_en (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: + 1. Write the caption using natural, descriptive language without structured formats or rich text. + 2. Enrich caption details by including: + - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + 3. Maintain authenticity and accuracy: + - Avoid generalizations + - Describe all visible information in the image, while do not add information not explicitly shown in the image + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + image_caption_prompt_cn (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: + 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 + 2. 通过加入以下内容,丰富图注细节: + - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 + - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 + - 环境细节:例如天气、光照、颜色、纹理、气氛等 + - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 + 3. 保持真实性与准确性: + - 不要使用笼统的描述 + - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. 
+ + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + prompt (`str`, *optional*): + The prompt to encode + + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + + resized_image (`List`): + The resized images + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage-layered" block_classes = [ @@ -72,7 +174,42 @@ def description(self) -> str: # Edit VAE encoder +#auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageLayeredVaeEncoderStep + + Vae encoder step that encode the image inputs into their latent representations. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredResizeStep(), @@ -93,7 +230,54 @@ def description(self) -> str: # assemble input steps +#auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): + """ + class QwenImageLayeredInputStep + + Input step that prepares the inputs for the layered denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + + pachifier (`QwenImageLayeredPachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + + height (`int`): + The height of the image output + + width (`int`): + The width of the image output + """ model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), @@ -111,7 +295,64 @@ def description(self): # Qwen Image Layered (image2image) core denoise step +#auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageLayeredCoreDenoiseStep + + Core denoising workflow for QwenImage-Layered img2img task. + + Components: + + pachifier (`QwenImageLayeredPachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredInputStep(), From 2a81f2ec5417efdc7773937dd7db2f675a46b66a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:15:36 +0100 Subject: [PATCH 07/23] style --- .../qwenimage/modular_blocks_qwenimage.py | 86 ++++++++++++------- .../modular_blocks_qwenimage_edit.py | 46 ++++++---- .../modular_blocks_qwenimage_edit_plus.py | 26 +++--- .../modular_blocks_qwenimage_layered.py | 47 +++++----- 4 files changed, 116 insertions(+), 89 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 19feffe77eda..d54dca5f5ad6 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -58,7 +58,8 @@ # 1. 
TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): """ class QwenImageAutoTextEncoderStep @@ -76,11 +77,8 @@ class QwenImageAutoTextEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -111,6 +109,7 @@ class QwenImageAutoTextEncoderStep negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ + model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] block_names = ["text_encoder"] @@ -127,7 +126,8 @@ def description(self) -> str: # 2. VAE ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageInpaintVaeEncoderStep @@ -175,6 +175,7 @@ class QwenImageInpaintVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @@ -189,7 +190,7 @@ def description(self) -> str: ) -#auto_docstring +# auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgVaeEncoderStep @@ -223,6 +224,7 @@ class QwenImageImg2ImgVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage" block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -250,13 +252,12 @@ def description(self): # optional controlnet vae encoder -#auto_docstring +# auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. - This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -287,6 +288,7 @@ class QwenImageOptionalControlNetVaeEncoderStep control_image_latents (`Tensor`): The latents representing the control image """ + block_classes = [QwenImageControlNetVaeEncoderStep] block_names = ["controlnet"] block_trigger_inputs = ["control_image"] @@ -307,7 +309,7 @@ def description(self): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgInputStep @@ -353,6 +355,7 @@ class QwenImageImg2ImgInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage" block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] block_names = ["text_inputs", "additional_inputs"] @@ -364,7 +367,7 @@ def description(self): " - update height/width based `image_latents`, patchify `image_latents`." 
-#auto_docstring +# auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): """ class QwenImageInpaintInputStep @@ -412,6 +415,7 @@ class QwenImageInpaintInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -429,7 +433,7 @@ def description(self): # assemble prepare latents steps -#auto_docstring +# auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ class QwenImageInpaintPrepareLatentsStep @@ -450,7 +454,8 @@ class QwenImageInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -472,6 +477,7 @@ class QwenImageInpaintPrepareLatentsStep mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -489,12 +495,13 @@ def description(self) -> str: # Qwen Image (text2image) -#auto_docstring +# auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -570,20 +577,22 @@ class QwenImageCoreDenoiseStep @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Qwen Image (inpainting) -#auto_docstring +# auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -675,13 +684,15 @@ def outputs(self): OutputParam.latents(), ] + # Qwen Image (image2image) -#auto_docstring +# auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -771,13 +782,15 @@ def outputs(self): OutputParam.latents(), ] + # Qwen Image (text2image) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. 
It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -871,20 +884,22 @@ class QwenImageControlNetCoreDenoiseStep @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Qwen Image (inpainting) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -996,12 +1011,13 @@ def outputs(self): # Qwen Image (image2image) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -1102,13 +1118,14 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep @property def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Auto denoise step for QwenImage class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): block_classes = [ @@ -1176,7 +1193,7 @@ def outputs(self): # standard decode step works for most tasks except for inpaint -#auto_docstring +# auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): """ class QwenImageDecodeStep @@ -1202,6 +1219,7 @@ class QwenImageDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -1212,12 +1230,13 @@ def description(self): # Inpaint decode step -#auto_docstring +# auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1240,6 +1259,7 @@ class QwenImageInpaintDecodeStep images (`List`): Generated images. 
""" + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index cae6236eb5aa..37a438ea1f54 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -55,7 +55,8 @@ # 1. TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditVLEncoderStep @@ -75,11 +76,10 @@ class QwenImageEditVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -130,7 +130,7 @@ def description(self) -> str: # Edit VAE encoder -#auto_docstring +# auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditVaeEncoderStep @@ -163,6 +163,7 @@ class QwenImageEditVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -177,7 +178,7 @@ def description(self) -> str: # Edit Inpaint VAE encoder -#auto_docstring +# auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintVaeEncoderStep @@ -224,6 +225,7 @@ class QwenImageEditInpaintVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. 
""" + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -265,7 +267,7 @@ def description(self): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): """ class QwenImageEditInputStep @@ -313,6 +315,7 @@ class QwenImageEditInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -329,7 +332,7 @@ def description(self): ) -#auto_docstring +# auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintInputStep @@ -379,6 +382,7 @@ class QwenImageEditInpaintInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -398,7 +402,7 @@ def description(self): # assemble prepare latents steps -#auto_docstring +# auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintPrepareLatentsStep @@ -419,7 +423,8 @@ class QwenImageEditInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -441,6 +446,7 @@ class QwenImageEditInpaintPrepareLatentsStep mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage-edit" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -455,7 +461,7 @@ def description(self) -> str: # Qwen Image Edit (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditCoreDenoiseStep @@ -547,7 +553,7 @@ def outputs(self): # Qwen Image Edit (inpainting) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintCoreDenoiseStep @@ -671,20 +677,21 @@ def description(self): " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." ) - + @property def outputs(self): return [ OutputParam.latents(), ] + # ==================== # 4. DECODE # ==================== # Decode step (standard) -#auto_docstring +# auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditDecodeStep @@ -710,6 +717,7 @@ class QwenImageEditDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -720,12 +728,13 @@ def description(self): # Inpaint decode step -#auto_docstring +# auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. 
+ Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -748,6 +757,7 @@ class QwenImageEditInpaintDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 2fcd633f0d7f..851b69f232e7 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -49,7 +49,7 @@ # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVLEncoderStep @@ -69,11 +69,10 @@ class QwenImageEditPlusVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -125,13 +124,13 @@ def description(self) -> str: # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. - Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based + on its own aspect ratio to 1024x1024 target area. Components: @@ -182,7 +181,7 @@ def description(self) -> str: # assemble input steps -#auto_docstring +# auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): """ class QwenImageEditPlusInputStep @@ -232,6 +231,7 @@ class QwenImageEditPlusInputStep image_width (`List`): The image widths calculated from the image latents dimension """ + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), @@ -251,7 +251,7 @@ def description(self): # Qwen Image Edit Plus (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditPlusCoreDenoiseStep @@ -312,6 +312,7 @@ class QwenImageEditPlusCoreDenoiseStep latents (`Tensor`): Denoised latents. 
""" + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusInputStep(), @@ -346,7 +347,7 @@ def outputs(self): # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditPlusDecodeStep @@ -372,6 +373,7 @@ class QwenImageEditPlusDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage-edit-plus" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index f647f16868ab..56fa1345a5ce 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -49,12 +49,14 @@ # 1. TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -71,28 +73,23 @@ class QwenImageLayeredTextEncoderStep Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 
通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -102,16 +99,11 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -174,7 +166,7 @@ def description(self) -> str: # Edit VAE encoder -#auto_docstring +# auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredVaeEncoderStep @@ -210,6 +202,7 @@ class QwenImageLayeredVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredResizeStep(), @@ -230,7 +223,7 @@ def description(self) -> str: # assemble input steps -#auto_docstring +# auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): """ class QwenImageLayeredInputStep @@ -278,6 +271,7 @@ class QwenImageLayeredInputStep width (`int`): The width of the image output """ + model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), @@ -295,7 +289,7 @@ def description(self): # Qwen Image Layered (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageLayeredCoreDenoiseStep @@ -353,6 +347,7 @@ class QwenImageLayeredCoreDenoiseStep latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredInputStep(), From f0555af1c6be0adb75404f2724a071d8b49b5506 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:15:53 +0100 Subject: [PATCH 08/23] up up up --- utils/modular_auto_docstring.py | 90 +++++++++++++++------------------ 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index c6aaf8a46a56..e2d523b2f378 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -36,7 +36,7 @@ # auto_docstring class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): # docstring will be automatically inserted here - + @property def doc(self): return "Your docstring content..." 
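# ---------------------------------------------------------------------------
# Illustrative usage sketch for the marker-based workflow above (not part of
# the patch itself; the block name below is hypothetical, everything else
# follows the utility as defined in this file):
#
#     # auto_docstring
#     class MyQwenImageStep(SequentialPipelineBlocks):
#         # the generated docstring is inserted/refreshed directly below the
#         # class line by utils/modular_auto_docstring.py
#
#         @property
#         def doc(self):
#             return "Docstring text assembled from the block's inputs/outputs."
#
# Check a file or directory for markers (default path is src/diffusers):
#     python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/qwenimage
# Insert or refresh the docstrings in place from each marked class's `doc` property:
#     python utils/modular_auto_docstring.py --fix_and_overwrite
# ---------------------------------------------------------------------------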
@@ -69,13 +69,13 @@ def setup_diffusers_import(): def get_module_from_filepath(filepath: str) -> str: """Convert a filepath to a module name.""" filepath = os.path.normpath(filepath) - + if filepath.startswith("src" + os.sep): filepath = filepath[4:] - + if filepath.endswith(".py"): filepath = filepath[:-3] - + module_name = filepath.replace(os.sep, ".") return module_name @@ -84,7 +84,7 @@ def load_module(filepath: str): """Load a module from filepath.""" setup_diffusers_import() module_name = get_module_from_filepath(filepath) - + try: module = importlib.import_module(module_name) return module @@ -97,30 +97,30 @@ def get_doc_from_class(module, class_name: str) -> str: """Get the doc property from an instantiated class.""" if module is None: return None - + cls = getattr(module, class_name, None) if cls is None: return None - + try: instance = cls() if hasattr(instance, "doc"): return instance.doc except Exception as e: print(f"Warning: Could not instantiate {class_name}: {e}") - + return None def find_auto_docstring_classes(filepath: str) -> list: """ Find all classes in a file that have # auto_docstring comment above them. - + Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) """ with open(filepath, "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() - + # Parse AST to find class locations and their docstrings content = "".join(lines) try: @@ -128,25 +128,25 @@ def find_auto_docstring_classes(filepath: str) -> list: except SyntaxError as e: print(f"Syntax error in {filepath}: {e}") return [] - + # Build a map of class_name -> (class_line, has_docstring, docstring_end_line) class_info = {} for node in ast.walk(tree): if isinstance(node, ast.ClassDef): has_docstring = False docstring_end_line = node.lineno # default to class line - + if node.body and isinstance(node.body[0], ast.Expr): first_stmt = node.body[0] if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str): has_docstring = True docstring_end_line = first_stmt.end_lineno or first_stmt.lineno - + class_info[node.name] = (node.lineno, has_docstring, docstring_end_line) - + # Now scan for # auto_docstring comments classes_to_update = [] - + for i, line in enumerate(lines): if AUTO_DOCSTRING_PATTERN.match(line): # Found the marker, look for class definition on next non-empty, non-comment line @@ -156,7 +156,7 @@ def find_auto_docstring_classes(filepath: str) -> list: if next_line and not next_line.startswith("#"): break j += 1 - + if j < len(lines) and lines[j].strip().startswith("class "): # Extract class name match = re.match(r"class\s+(\w+)", lines[j].strip()) @@ -164,20 +164,15 @@ def find_auto_docstring_classes(filepath: str) -> list: class_name = match.group(1) if class_name in class_info: class_line, has_docstring, docstring_end_line = class_info[class_name] - classes_to_update.append(( - class_name, - class_line, - has_docstring, - docstring_end_line - )) - + classes_to_update.append((class_name, class_line, has_docstring, docstring_end_line)) + return classes_to_update def format_docstring(doc: str, indent: str = " ") -> str: """Format a doc string as a properly indented docstring.""" lines = doc.strip().split("\n") - + if len(lines) == 1: return f'{indent}"""{lines[0]}"""\n' else: @@ -194,36 +189,36 @@ def format_docstring(doc: str, indent: str = " ") -> str: def process_file(filepath: str, overwrite: bool = False) -> list: """ Process a file and find/insert docstrings for # auto_docstring marked classes. 
- + Returns list of classes that need updating. """ classes_to_update = find_auto_docstring_classes(filepath) - + if not classes_to_update: return [] - + if not overwrite: # Just return the list of classes that need updating return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] - + # Load the module to get doc properties module = load_module(filepath) - + with open(filepath, "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() - + # Process in reverse order to maintain line numbers updated = False for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update): doc = get_doc_from_class(module, class_name) - + if doc is None: print(f"Warning: Could not get doc for {class_name} in {filepath}") continue - + # Format the new docstring with 4-space indent new_docstring = format_docstring(doc, " ") - + if has_docstring: # Replace existing docstring (line after class definition to docstring_end_line) # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line @@ -233,14 +228,14 @@ def process_file(filepath: str, overwrite: bool = False) -> list: # class_line is 1-indexed, so lines[class_line-1] is the class line # Insert at position class_line (which is right after the class line) lines = lines[:class_line] + [new_docstring] + lines[class_line:] - + updated = True print(f"Updated docstring for {class_name} in {filepath}") - + if updated: with open(filepath, "w", encoding="utf-8", newline="\n") as f: f.writelines(lines) - + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] @@ -250,25 +245,25 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): """ if path is None: path = DIFFUSERS_PATH - + if os.path.isfile(path): all_files = [path] else: all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True) - + all_markers = [] - + for filepath in all_files: markers = process_file(filepath, overwrite) all_markers.extend(markers) - + if not overwrite and len(all_markers) > 0: message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers]) raise ValueError( f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n" f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them." 
) - + if overwrite and len(all_markers) > 0: print(f"\nUpdated {len(all_markers)} docstring(s).") elif len(all_markers) == 0: @@ -279,18 +274,13 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): parser = argparse.ArgumentParser( description="Check and fix # auto_docstring markers in modular pipeline blocks", ) - parser.add_argument( - "path", - nargs="?", - default=None, - help="File or directory to process (default: src/diffusers)" - ) + parser.add_argument("path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)") parser.add_argument( "--fix_and_overwrite", action="store_true", help="Whether to fix the docstrings by inserting them from doc property.", ) - + args = parser.parse_args() - - check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file + + check_auto_docstrings(args.path, args.fix_and_overwrite) From 507953f4156349d4d96cc6a8e0e7aa8eeefcf47e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:19:14 +0100 Subject: [PATCH 09/23] more more --- .../qwenimage/modular_blocks_qwenimage.py | 168 +++++++++++++++--- .../modular_blocks_qwenimage_edit.py | 118 +++++++++++- .../modular_blocks_qwenimage_edit_plus.py | 102 ++++++++++- .../modular_blocks_qwenimage_layered.py | 165 +++++++++++++++-- 4 files changed, 503 insertions(+), 50 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index d54dca5f5ad6..7f18de4f99dd 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -77,8 +77,11 @@ class QwenImageAutoTextEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -257,7 +260,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -454,8 +458,7 @@ class QwenImageInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -500,8 +503,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. 
It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -591,8 +593,7 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -691,8 +692,7 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -789,8 +789,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -898,8 +897,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -1016,8 +1014,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -1235,8 +1232,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: @@ -1298,8 +1294,140 @@ def description(self): ] ) - +# auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageAutoBlocks + + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. 
+ - for image-to-image generation, you need to provide `image` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - to run the controlnet workflow, you need to provide `control_image` + - for text-to-image generation, all you need to provide is `prompt` + + Components: + + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + control_image_processor (`VaeImageProcessor`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + mask_image (`Image`, *optional*): + Mask image for inpainting. + + image (`Image`, *optional*): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + control_image (`Image`, *optional*): + Control image for ControlNet conditioning. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + latents (`Tensor`): + Pre-generated noisy latents for image generation. + + num_inference_steps (`int`): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + control_image_latents (`None`, *optional*): + + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. 
+ + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage" block_classes = AUTO_BLOCKS.values() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 37a438ea1f54..91efe9dda2bf 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -76,10 +76,11 @@ class QwenImageEditVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) @@ -423,8 +424,7 @@ class QwenImageEditInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -733,8 +733,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: @@ -802,8 +801,109 @@ def outputs(self): ] ) - +# auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageEditAutoBlocks + + Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
+ - for edit (img2img) generation, you need to provide `image` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + mask_image (`Image`, *optional*): + Mask image for inpainting. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + height (`int`): + The height in pixels of the generated image. + + width (`int`): + The width in pixels of the generated image. + + image_latents (`None`): + + processed_mask_image (`None`, *optional*): + + latents (`Tensor`): + Pre-generated noisy latents for image generation. + + num_inference_steps (`int`): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. 
+ """ model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 851b69f232e7..3a780daf9602 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -69,10 +69,11 @@ class QwenImageEditPlusVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -129,8 +130,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based - on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. Components: @@ -396,8 +397,95 @@ def description(self): ] ) - +# auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageEditPlusAutoBlocks + + Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. + - `image` is required input (can be single image or list of images). + - Each image is resized independently based on its own aspect ratio. + - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage-edit-plus" block_classes = EDIT_PLUS_AUTO_BLOCKS.values() block_names = EDIT_PLUS_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 56fa1345a5ce..7cb5cd7a1ca3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -55,8 +55,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: @@ -73,23 +72,28 @@ class QwenImageLayeredTextEncoderStep Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -99,11 +103,16 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -390,8 +399,136 @@ def outputs(self): ] ) - +# auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageLayeredAutoBlocks + + Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + pachifier (`QwenImageLayeredPachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + image_caption_prompt_en (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: + 1. 
Write the caption using natural, descriptive language without structured formats or rich text. + 2. Enrich caption details by including: + - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + 3. Maintain authenticity and accuracy: + - Avoid generalizations + - Describe all visible information in the image, while do not add information not explicitly shown in the image + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + image_caption_prompt_cn (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: + 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 + 2. 通过加入以下内容,丰富图注细节: + - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 + - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 + - 环境细节:例如天气、光照、颜色、纹理、气氛等 + - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 + 3. 保持真实性与准确性: + - 不要使用笼统的描述 + - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + prompt (`str`, *optional*): + The prompt to encode + + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. 
+ """ model_name = "qwenimage-layered" block_classes = LAYERED_AUTO_BLOCKS.values() block_names = LAYERED_AUTO_BLOCKS.keys() From 1c90ce33f2445b29c1967976a1734db97f5eaa3a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:21:26 +0100 Subject: [PATCH 10/23] up --- .../qwenimage/modular_blocks_qwenimage.py | 47 +++++++------ .../modular_blocks_qwenimage_edit.py | 29 ++++---- .../modular_blocks_qwenimage_edit_plus.py | 24 +++---- .../modular_blocks_qwenimage_layered.py | 69 +++++++------------ 4 files changed, 79 insertions(+), 90 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 7f18de4f99dd..85b77c2a6b93 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -77,11 +77,8 @@ class QwenImageAutoTextEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -260,8 +257,7 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. - This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -458,7 +454,8 @@ class QwenImageInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -503,7 +500,8 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -593,7 +591,8 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -692,7 +691,8 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) 
for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -789,7 +789,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -897,7 +898,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -1014,7 +1016,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -1232,7 +1235,8 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1294,6 +1298,7 @@ def description(self): ] ) + # auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): """ @@ -1301,7 +1306,7 @@ class QwenImageAutoBlocks Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` - to run the controlnet workflow, you need to provide `control_image` - for text-to-image generation, all you need to provide is `prompt` @@ -1332,11 +1337,8 @@ class QwenImageAutoBlocks Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -1428,6 +1430,7 @@ class QwenImageAutoBlocks images (`List`): Generated images. 
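    Example: a minimal usage sketch, assuming the standard Modular Diffusers loading workflow; the import path, the
    helpers `init_pipeline` / `load_default_components`, the `output="images"` call convention, and the repo id
    "Qwen/Qwen-Image" are assumptions rather than values defined in this module.

        import torch
        from diffusers.modular_pipelines.qwenimage import QwenImageAutoBlocks  # import path assumed

        # Build the block graph and bind it to a pretrained repository (repo id assumed).
        blocks = QwenImageAutoBlocks()
        pipe = blocks.init_pipeline("Qwen/Qwen-Image")
        pipe.load_default_components(torch_dtype=torch.bfloat16)
        pipe.to("cuda")

        # Text-to-image only needs `prompt`; passing `image`, `mask_image`, or
        # `control_image` switches the auto blocks into the img2img, inpainting,
        # or controlnet workflow respectively.
        images = pipe(prompt="a cat wearing a tiny hat", num_inference_steps=50, output="images")
        images[0].save("qwenimage_t2i.png")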
""" + model_name = "qwenimage" block_classes = AUTO_BLOCKS.values() @@ -1438,7 +1441,7 @@ def description(self): return ( "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n" + "- to run the controlnet workflow, you need to provide `control_image`\n" + "- for text-to-image generation, all you need to provide is `prompt`" ) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 91efe9dda2bf..3fcbc8853f48 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -76,11 +76,10 @@ class QwenImageEditVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -424,7 +423,8 @@ class QwenImageEditInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -733,7 +733,8 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -801,6 +802,7 @@ def outputs(self): ] ) + # auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ @@ -808,7 +810,8 @@ class QwenImageEditAutoBlocks Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: @@ -835,11 +838,10 @@ class QwenImageEditAutoBlocks Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -904,6 +906,7 @@ class QwenImageEditAutoBlocks images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 3a780daf9602..0364e394d29d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -69,11 +69,10 @@ class QwenImageEditPlusVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -130,8 +129,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. - Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based + on its own aspect ratio to 1024x1024 target area. 
Components: @@ -397,6 +396,7 @@ def description(self): ] ) + # auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): """ @@ -430,11 +430,10 @@ class QwenImageEditPlusAutoBlocks Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -486,6 +485,7 @@ class QwenImageEditPlusAutoBlocks images (`List`): Generated images. """ + model_name = "qwenimage-edit-plus" block_classes = EDIT_PLUS_AUTO_BLOCKS.values() block_names = EDIT_PLUS_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7cb5cd7a1ca3..5602fc9b93e5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -55,7 +55,8 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -72,28 +73,23 @@ class QwenImageLayeredTextEncoderStep Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -103,16 +99,11 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -399,6 +390,7 @@ def outputs(self): ] ) + # auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): """ @@ -431,28 +423,23 @@ class QwenImageLayeredAutoBlocks Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -462,16 +449,11 @@ class QwenImageLayeredAutoBlocks 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -529,6 +511,7 @@ class QwenImageLayeredAutoBlocks images (`List`): Generated images. 
""" + model_name = "qwenimage-layered" block_classes = LAYERED_AUTO_BLOCKS.values() block_names = LAYERED_AUTO_BLOCKS.keys() From aea0d046f6eb759dca55a11bd9c55f89db39b3e4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 17 Jan 2026 09:36:58 +0100 Subject: [PATCH 11/23] address feedbacks --- .../modular_pipeline_utils.py | 4 +- .../qwenimage/modular_blocks_qwenimage.py | 408 ++++-------------- .../modular_blocks_qwenimage_edit.py | 256 +++-------- .../modular_blocks_qwenimage_edit_plus.py | 147 ++----- .../modular_blocks_qwenimage_layered.py | 190 +++----- utils/modular_auto_docstring.py | 16 +- 6 files changed, 271 insertions(+), 750 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index fab7c7193e5d..368fbbcbd138 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -711,7 +711,7 @@ def wrap_text(text, indent, max_length): formatted_params.append(param_str) - return "\n\n".join(formatted_params) + return "\n".join(formatted_params) def format_input_params(input_params, indent_level=4, max_line_length=115): @@ -781,7 +781,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty loading_field_values = [] for field_name in component.loading_fields(): field_value = getattr(component, field_name) - if field_value is not None: + if field_value: loading_field_values.append(f"{field_name}={field_value}") # Add loading field information if available diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 85b77c2a6b93..3bd4ae56832a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -62,50 +62,44 @@ # auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): """ - class QwenImageAutoTextEncoderStep - - Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. 
Outputs: - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -130,48 +124,36 @@ def description(self) -> str: # auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for inpainting tasks. It: + This step is used for processing image and mask inputs for inpainting tasks. It: - Resizes the image to the target size, based on `height` and `width`. - Processes and updates `image` and `mask_image`. - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - mask_image (`Image`): Mask image for inpainting. - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -193,34 +175,26 @@ def description(self) -> str: # auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgVaeEncoderStep - - Vae encoder step that preprocess andencode the image inputs into their latent representations. + Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -255,36 +229,30 @@ def description(self): # auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ - class QwenImageOptionalControlNetVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. 
Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) Inputs: - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - control_image_latents (`Tensor`): The latents representing the control image """ @@ -312,46 +280,32 @@ def description(self): # auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgInputStep - - Input step that prepares the inputs for the img2img denoising step. It: + Input step that prepares the inputs for the img2img denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -370,48 +324,33 @@ def description(self): # auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageInpaintInputStep - - Input step that prepares the inputs for the inpainting denoising step. It: + Input step that prepares the inputs for the inpainting denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -436,44 +375,32 @@ def description(self): # auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. 
It: + This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. """ @@ -498,60 +425,43 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -589,67 +499,47 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. 
Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -689,65 +579,46 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. 
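    Example: a conceptual sketch of how `strength` typically truncates the timestep schedule for img2img-style
    denoising; it illustrates the general technique only and is not the block's actual implementation.

        def truncate_timesteps_for_img2img(timesteps, num_inference_steps, strength):
            # Keep only the final `strength` fraction of the schedule, so a lower
            # strength starts denoising closer to the input image latents.
            init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
            t_start = max(num_inference_steps - init_timestep, 0)
            return timesteps[t_start:], num_inference_steps - t_start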
""" @@ -787,74 +658,53 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - control_image_latents (`None`): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -896,81 +746,57 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
- prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1014,79 +840,56 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. 
- controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1196,26 +999,21 @@ def outputs(self): # auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): """ - class QwenImageDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -1233,29 +1031,22 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ @@ -1302,131 +1093,102 @@ def description(self): # auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageAutoBlocks - - Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`. 
- to run the controlnet workflow, you need to provide `control_image` - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Image`, *optional*): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. 
- control_image_latents (`None`, *optional*): - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 3fcbc8853f48..627cfce6ee7b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -59,55 +59,46 @@ # auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVLEncoderStep - - QwenImage-Edit VL encoder step that encode the image and text prompts together. + QwenImage-Edit VL encoder step that encode the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -133,33 +124,26 @@ def description(self) -> str: # auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. + Vae encoder step that encode the image inputs into their latent representations. 
Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -181,47 +165,36 @@ def description(self) -> str: # auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: + This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: - resize the image for target area (1024 * 1024) while maintaining the aspect ratio. - process the resized image and mask image. - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - mask_image (`Image`): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -270,48 +243,34 @@ def description(self): # auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInputStep - - Input step that prepares the inputs for the edit denoising step. It: + Input step that prepares the inputs for the edit denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. 
- image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -335,50 +294,35 @@ def description(self): # auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintInputStep - - Input step that prepares the inputs for the edit inpaint denoising step. It: + Input step that prepares the inputs for the edit inpaint denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -405,44 +349,32 @@ def description(self): # auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: + This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the patchified latents `mask` based on the processed mask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. 
""" @@ -464,61 +396,44 @@ def description(self) -> str: # auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit (img2img) task. + Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -556,66 +471,47 @@ def outputs(self): # auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit inpaint task. + Core denoising workflow for QwenImage-Edit edit inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. 
- **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -694,26 +590,21 @@ def outputs(self): # auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -731,29 +622,22 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ @@ -806,103 +690,81 @@ def outputs(self): # auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditAutoBlocks - - Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. + Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide - `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - mask_image (`Image`, *optional*): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`): The height in pixels of the generated image. - width (`int`): The width in pixels of the generated image. - image_latents (`None`): - processed_mask_image (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
- output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 0364e394d29d..cc07fc1e6a75 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -52,57 +52,48 @@ # auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVLEncoderStep - - QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. + QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_cond_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -127,34 +118,27 @@ def description(self) -> str: # auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVaeEncoderStep - - VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based - on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -184,9 +168,7 @@ def description(self) -> str: # auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusInputStep - - Input step that prepares the inputs for the Edit Plus denoising step. It: + Input step that prepares the inputs for the Edit Plus denoising step. It: - Standardizes text embeddings batch size. - Processes list of image latents: patchifies, concatenates along dim=1, expands batch. - Outputs lists of image_height/image_width for RoPE calculation. @@ -194,40 +176,28 @@ class QwenImageEditPlusInputStep Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`List`): The image heights calculated from the image latents dimension - image_width (`List`): The image widths calculated from the image latents dimension """ @@ -254,61 +224,44 @@ def description(self): # auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. + Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. 
- num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -350,26 +303,21 @@ def outputs(self): # auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusDecodeStep - - Decode step that decodes the latents to images and postprocesses the generated image. + Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -400,88 +348,73 @@ def description(self): # auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditPlusAutoBlocks - - Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. + Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. - `image` is required input (can be single image or list of images). - Each image is resized independently based on its own aspect ratio. - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 5602fc9b93e5..7cbc174871b5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,43 +53,45 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredTextEncoderStep - - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -99,50 +101,44 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -169,36 +165,28 @@ def description(self) -> str: # auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. 
+ Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -226,48 +214,34 @@ def description(self) -> str: # auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): """ - class QwenImageLayeredInputStep - - Input step that prepares the inputs for the layered denoising step. It: + Input step that prepares the inputs for the layered denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension - height (`int`): The height of the image output - width (`int`): The width of the image output """ @@ -292,58 +266,42 @@ def description(self): # auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageLayeredCoreDenoiseStep - - Core denoising workflow for QwenImage-Layered img2img task. + Core denoising workflow for QwenImage-Layered img2img task. Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - generator (`Generator`, *optional*): Torch generator for deterministic generation. 
- num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -394,52 +352,55 @@ def outputs(self): # auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageLayeredAutoBlocks - - Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. + Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. 
Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -449,65 +410,54 @@ class QwenImageLayeredAutoBlocks 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. 
""" diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index e2d523b2f378..01d984a58430 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -169,6 +169,17 @@ def find_auto_docstring_classes(filepath: str) -> list: return classes_to_update +def strip_class_name_line(doc: str, class_name: str) -> str: + """Remove the 'class ClassName' line from the doc if present.""" + lines = doc.strip().split("\n") + if lines and lines[0].strip() == f"class {class_name}": + # Remove the class line and any blank line following it + lines = lines[1:] + while lines and not lines[0].strip(): + lines = lines[1:] + return "\n".join(lines) + + def format_docstring(doc: str, indent: str = " ") -> str: """Format a doc string as a properly indented docstring.""" lines = doc.strip().split("\n") @@ -216,6 +227,9 @@ def process_file(filepath: str, overwrite: bool = False) -> list: print(f"Warning: Could not get doc for {class_name} in {filepath}") continue + # Remove the "class ClassName" line since it's redundant in a docstring + doc = strip_class_name_line(doc, class_name) + # Format the new docstring with 4-space indent new_docstring = format_docstring(doc, " ") @@ -283,4 +297,4 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): args = parser.parse_args() - check_auto_docstrings(args.path, args.fix_and_overwrite) + check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file From 25c968a38f991b020d12604eedb4efda1d016dee Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 17 Jan 2026 09:57:56 +0100 Subject: [PATCH 12/23] add TODO in the description for empty docstring --- .../modular_pipeline_utils.py | 2 + .../modular_pipelines/qwenimage/encoders.py | 3 +- .../qwenimage/modular_blocks_qwenimage.py | 97 +++++++++++++++---- .../modular_blocks_qwenimage_edit.py | 59 ++++++++--- .../modular_blocks_qwenimage_edit_plus.py | 29 ++++-- .../modular_blocks_qwenimage_layered.py | 78 +++++++-------- utils/modular_auto_docstring.py | 2 +- 7 files changed, 184 insertions(+), 86 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 368fbbcbd138..45556c538ab8 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -708,6 +708,8 @@ def wrap_text(text, indent, max_length): desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description) wrapped_desc = wrap_text(desc, desc_indent, max_line_length) param_str += f"\n{desc_indent}{wrapped_desc}" + else: + param_str += f"\n{desc_indent}TODO: Add description." 
formatted_params.append(param_str) diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index f0dd6471b168..8d7b1905423d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -1324,7 +1324,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.template(self._image_input_name) + or InputParam(name=self._image_input_name, required=True, description="The image tensor to encode"), InputParam.generator(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 3bd4ae56832a..645c01f66ee5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -75,11 +75,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -151,7 +148,9 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Outputs: processed_image (`None`): + TODO: Add description. processed_mask_image (`None`): + TODO: Add description. mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): @@ -195,6 +194,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Outputs: processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -290,14 +290,19 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -334,15 +339,21 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. 
image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -389,14 +400,18 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. height (`None`): + TODO: Add description. width (`None`): + TODO: Add description. dtype (`None`): + TODO: Add description. Outputs: initial_noise (`Tensor`): @@ -425,7 +440,8 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: @@ -441,9 +457,13 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. height (`int`, *optional*): @@ -499,7 +519,8 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: @@ -515,15 +536,21 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -579,7 +606,8 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. 
Components: @@ -595,14 +623,19 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -658,7 +691,8 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: @@ -676,10 +710,15 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -746,7 +785,8 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: @@ -764,16 +804,23 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -840,7 +887,8 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. 
Components: @@ -858,15 +906,21 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -1031,7 +1085,8 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1045,6 +1100,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): @@ -1126,11 +1182,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -1160,9 +1213,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -1174,10 +1231,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. control_image_latents (`None`, *optional*): + TODO: Add description. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): @@ -1187,6 +1247,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. 
Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 627cfce6ee7b..0bfbb921c9c4 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -74,11 +74,10 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -144,6 +143,7 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -192,7 +192,9 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. processed_mask_image (`None`): + TODO: Add description. mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): @@ -255,14 +257,19 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -306,15 +313,21 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. 
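Entries such as `prompt_embeds` and `image_latents` above are declared without descriptions at this point in the series, which is exactly what the later `InputParam.template(...)` refactor (see the modular_pipeline_utils.py changes further down) addresses: a template is shorthand for the fully spelled-out parameter. A minimal sketch of that equivalence, assuming the later patches in this series are applied, using the `num_images_per_prompt` entry from INPUT_PARAM_TEMPLATES:

# Sketch: the two declarations below should produce equivalent specs once the
# template refactor introduced later in this patch series is in place.
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

explicit = InputParam(
    name="num_images_per_prompt",
    type_hint=int,
    default=1,
    description="The number of images to generate per prompt.",
)
templated = InputParam.template("num_images_per_prompt")

assert templated.name == explicit.name
assert templated.default == explicit.default == 1
assert templated.description == explicit.description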
Outputs: batch_size (`int`): @@ -363,14 +376,18 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. height (`None`): + TODO: Add description. width (`None`): + TODO: Add description. dtype (`None`): + TODO: Add description. Outputs: initial_noise (`Tensor`): @@ -412,14 +429,19 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -487,15 +509,21 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -622,7 +650,8 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -636,6 +665,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): @@ -692,7 +722,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: @@ -719,11 +750,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -747,7 +777,9 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): width (`int`): The width in pixels of the generated image. image_latents (`None`): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -763,6 +795,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index cc07fc1e6a75..8dab6fbcf95d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -67,11 +67,10 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -139,6 +138,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. 
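Once the `note=` parameter lands (PATCH 14 below), blocks can reuse a shared description and append step-specific context instead of leaving gaps like the ones above. A sketch of that behaviour, assuming the template tables and validation introduced later in this series:

# Sketch of InputParam.template semantics after PATCH 14 in this series.
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

# Base template plus a step-specific note appended to the description.
height = InputParam.template("height", required=True, note="should be updated in prepare latents step.")
assert height.required is True
assert height.description.endswith("(should be updated in prepare latents step.)")

# Keyword overrides replace template fields outright.
steps = InputParam.template("num_inference_steps", default=28)
assert steps.default == 28

# Unknown names fail loudly instead of silently returning a bare param.
try:
    InputParam.template("not_a_real_template")
except ValueError as err:
    print(err)  # "InputParam template for not_a_real_template not found"

# required + default is rejected at construction time, per the __post_init__
# guard added in the same patch.
try:
    InputParam(name="prompt", required=True, default="a cat")
except ValueError as err:
    print(err)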
image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -182,14 +182,19 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -240,14 +245,19 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -376,11 +386,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7cbc174871b5..544b1abfc3ed 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,7 +53,8 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -70,28 +71,23 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. 
Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -101,16 +97,11 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -187,6 +178,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -226,10 +218,15 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. image_latents (`None`, *optional*): + TODO: Add description. 
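The `prompt_embeds` / `image_latents` intermediates listed above are produced by earlier encoder steps; under the template refactor later in the series, those producer blocks can declare their outputs from OUTPUT_PARAM_TEMPLATES, which also carry the `kwargs_type="denoiser_input_fields"` tag that lets downstream denoise steps collect them through the `**denoiser_input_fields` catch-all input. A minimal sketch, assuming the tables from PATCH 14:

# Sketch: how a text-encoder style block could declare its intermediate outputs
# from OUTPUT_PARAM_TEMPLATES (names and fields as introduced later in this series).
from diffusers.modular_pipelines.modular_pipeline_utils import OutputParam

intermediate_outputs = [
    OutputParam.template("prompt_embeds"),
    OutputParam.template("prompt_embeds_mask"),
    OutputParam.template("negative_prompt_embeds"),
    OutputParam.template("negative_prompt_embeds_mask"),
]

# Each template is tagged so later steps can group these conditional inputs.
assert all(p.kwargs_type == "denoiser_input_fields" for p in intermediate_outputs)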
Outputs: batch_size (`int`): @@ -282,10 +279,15 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. layers (`int`, *optional*, defaults to 4): @@ -379,28 +381,23 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -410,16 +407,11 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 
保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index 01d984a58430..7bb2c87e81da 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -297,4 +297,4 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): args = parser.parse_args() - check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file + check_auto_docstrings(args.path, args.fix_and_overwrite) From de03d7f1005777cc3bfdf9107bb8b775311fce8d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 18 Jan 2026 00:35:01 +0100 Subject: [PATCH 13/23] refactor based on dhruv's feedback: remove the class method --- .../modular_pipeline_utils.py | 343 ++++++++---------- 1 file changed, 147 insertions(+), 196 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 45556c538ab8..f8dde1fbd096 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -324,6 +324,133 @@ class ConfigSpec: description: Optional[str] = None +# ====================================================== +# InputParam and OutputParam templates +# ====================================================== + +INPUT_PARAM_TEMPLATES = { + "prompt": { + "type_hint": str, + "required": True, + "description": "The prompt or prompts to guide image generation.", + }, + "negative_prompt": { + "type_hint": str, + "default": None, + "description": "The prompt or prompts not to guide the image generation.", + }, + "max_sequence_length": { + "type_hint": int, + "default": 512, + "description": "Maximum sequence length for prompt encoding.", + }, + "height": { + "type_hint": int, + "description": "The height in pixels of the generated image.", + }, + "width": { + "type_hint": int, + "description": "The width in pixels of the generated image.", + }, + "num_inference_steps": { + "type_hint": int, + "default": 50, + "description": "The number of denoising steps.", + }, + "num_images_per_prompt": { + "type_hint": int, + "default": 1, + "description": "The number of images to generate per prompt.", + }, + "generator": { + "type_hint": torch.Generator, + "default": None, + "description": "Torch generator for deterministic generation.", + }, + "sigmas": { + "type_hint": List[float], + "default": None, + "description": "Custom sigmas for the denoising process.", + }, + "strength": { + "type_hint": float, + "default": 0.9, + "description": "Strength for img2img/inpainting.", + }, + "image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Input image for img2img, editing, or conditioning.", + }, + "mask_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Mask image for 
inpainting.", + }, + "control_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Control image for ControlNet conditioning.", + }, + "padding_mask_crop": { + "type_hint": int, + "default": None, + "description": "Padding for mask cropping in inpainting.", + }, + "latents": { + "type_hint": torch.Tensor, + "default": None, + "description": "Pre-generated noisy latents for image generation.", + }, + "timesteps": { + "type_hint": torch.Tensor, + "default": None, + "description": "Timesteps for the denoising process.", + }, + "output_type": { + "type_hint": str, + "default": "pil", + "description": "Output format: 'pil', 'np', 'pt'.", + }, + "attention_kwargs": { + "type_hint": Dict[str, Any], + "default": None, + "description": "Additional kwargs for attention processors.", + }, + "denoiser_input_fields": { + "kwargs_type": "denoiser_input_fields", + "type_hint": torch.Tensor, + "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + }, + "control_guidance_start": { + "type_hint": float, + "default": 0.0, + "description": "When to start applying ControlNet.", + }, + "control_guidance_end": { + "type_hint": float, + "default": 1.0, + "description": "When to stop applying ControlNet.", + }, + "controlnet_conditioning_scale": { + "type_hint": float, + "default": 1.0, + "description": "Scale for ControlNet conditioning.", + }, +} + +OUTPUT_PARAM_TEMPLATES = { + "images": { + "type_hint": List[PIL.Image.Image], + "description": "Generated images.", + }, + "latents": { + "type_hint": torch.Tensor, + "description": "Denoised latents.", + }, +} + + # YiYi Notes: both inputs and intermediate_inputs are InputParam objects # however some fields are not relevant for intermediate_inputs # e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed @@ -344,190 +471,22 @@ def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @classmethod - def template(cls, name: str) -> Optional["InputParam"]: - """Get template for name if exists, otherwise None.""" - if hasattr(cls, name) and callable(getattr(cls, name)): - return getattr(cls, name)() - return None - - # ====================================================== - # InputParam templates - # ====================================================== - - @classmethod - def prompt(cls) -> "InputParam": - return cls( - name="prompt", type_hint=str, required=True, description="The prompt or prompts to guide image generation." - ) - - @classmethod - def negative_prompt(cls) -> "InputParam": - return cls( - name="negative_prompt", - type_hint=str, - default=None, - description="The prompt or prompts not to guide the image generation.", - ) - - @classmethod - def max_sequence_length(cls, default: int = 512) -> "InputParam": - return cls( - name="max_sequence_length", - type_hint=int, - default=default, - description="Maximum sequence length for prompt encoding.", - ) - - @classmethod - def height(cls, default: Optional[int] = None) -> "InputParam": - return cls( - name="height", type_hint=int, default=default, description="The height in pixels of the generated image." - ) - - @classmethod - def width(cls, default: Optional[int] = None) -> "InputParam": - return cls( - name="width", type_hint=int, default=default, description="The width in pixels of the generated image." 
- ) - - @classmethod - def num_inference_steps(cls, default: int = 50) -> "InputParam": - return cls( - name="num_inference_steps", type_hint=int, default=default, description="The number of denoising steps." - ) - - @classmethod - def num_images_per_prompt(cls, default: int = 1) -> "InputParam": - return cls( - name="num_images_per_prompt", - type_hint=int, - default=default, - description="The number of images to generate per prompt.", - ) - - @classmethod - def generator(cls) -> "InputParam": - return cls( - name="generator", - type_hint=torch.Generator, - default=None, - description="Torch generator for deterministic generation.", - ) - - @classmethod - def sigmas(cls) -> "InputParam": - return cls( - name="sigmas", type_hint=List[float], default=None, description="Custom sigmas for the denoising process." - ) - - @classmethod - def strength(cls, default: float = 0.9) -> "InputParam": - return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") - - # images - @classmethod - def image(cls) -> "InputParam": - return cls( - name="image", - type_hint=PIL.Image.Image, - required=True, - description="Input image for img2img, editing, or conditioning.", - ) - - @classmethod - def mask_image(cls) -> "InputParam": - return cls( - name="mask_image", type_hint=PIL.Image.Image, required=True, description="Mask image for inpainting." - ) - - @classmethod - def control_image(cls) -> "InputParam": - return cls( - name="control_image", - type_hint=PIL.Image.Image, - required=True, - description="Control image for ControlNet conditioning.", - ) - - @classmethod - def padding_mask_crop(cls) -> "InputParam": - return cls( - name="padding_mask_crop", - type_hint=int, - default=None, - description="Padding for mask cropping in inpainting.", - ) - - @classmethod - def latents(cls) -> "InputParam": - return cls( - name="latents", - type_hint=torch.Tensor, - default=None, - description="Pre-generated noisy latents for image generation.", - ) - - @classmethod - def timesteps(cls) -> "InputParam": - return cls( - name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process." - ) - - @classmethod - def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", description="Output format: 'pil', 'np', 'pt''.") - - @classmethod - def attention_kwargs(cls) -> "InputParam": - return cls( - name="attention_kwargs", - type_hint=Dict[str, Any], - default=None, - description="Additional kwargs for attention processors.", - ) - - @classmethod - def denoiser_input_fields(cls) -> "InputParam": - return cls( - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", - ) - - # ControlNet - @classmethod - def control_guidance_start(cls, default: float = 0.0) -> "InputParam": - return cls( - name="control_guidance_start", - type_hint=float, - default=default, - description="When to start applying ControlNet.", - ) - - @classmethod - def control_guidance_end(cls, default: float = 1.0) -> "InputParam": - return cls( - name="control_guidance_end", - type_hint=float, - default=default, - description="When to stop applying ControlNet.", - ) - - @classmethod - def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": - return cls( - name="controlnet_conditioning_scale", - type_hint=float, - default=default, - description="Scale for ControlNet conditioning.", - ) + def template(cls, name: str, **overrides) -> "InputParam": + """Get template for name if exists, otherwise return basic InputParam with just the name.""" + if name in INPUT_PARAM_TEMPLATES: + kwargs = {"name": name, **INPUT_PARAM_TEMPLATES[name]} + # Override with user-provided values + for key, value in overrides.items(): + kwargs[key] = value + return cls(**kwargs) + return cls(name=name, **overrides) @dataclass class OutputParam: """Specification for an output parameter.""" - name: str + name: str = None type_hint: Any = None description: str = "" kwargs_type: str = None # YiYi notes: remove this feature (maybe) @@ -538,23 +497,15 @@ def __repr__(self): ) @classmethod - def template(cls, name: str) -> Optional["OutputParam"]: - """Get template for name if exists, otherwise None.""" - if hasattr(cls, name) and callable(getattr(cls, name)): - return getattr(cls, name)() - return None - - # ====================================================== - # OutputParam templates - # ====================================================== - - @classmethod - def images(cls) -> "OutputParam": - return cls(name="images", type_hint=List[PIL.Image.Image], description="Generated images.") - - @classmethod - def latents(cls) -> "OutputParam": - return cls(name="latents", type_hint=torch.Tensor, description="Denoised latents.") + def template(cls, name: str, **overrides) -> "OutputParam": + """Get template for name if exists, otherwise return basic OutputParam with just the name.""" + if name in OUTPUT_PARAM_TEMPLATES: + kwargs = {"name": name, **OUTPUT_PARAM_TEMPLATES[name]} + # Override with user-provided values + for key, value in overrides.items(): + kwargs[key] = value + return cls(**kwargs) + return cls(name=name, **overrides) def format_inputs_short(inputs): @@ -890,4 +841,4 @@ def make_doc_string( output += "\n\n" output += format_output_params(outputs, indent_level=2) - return output + return output \ No newline at end of file From 002c3e8239b267e17b3849d1e53fde78890f0ad1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 03:24:34 +0100 Subject: [PATCH 14/23] add template method --- .../modular_pipeline_utils.py | 163 ++++++++++++------ 1 file changed, 112 insertions(+), 51 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index f8dde1fbd096..a65aa43b2a3b 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -336,7 +336,6 @@ class ConfigSpec: }, "negative_prompt": { "type_hint": str, - "default": None, "description": "The prompt or prompts not to guide the image generation.", }, "max_sequence_length": { @@ -364,12 +363,10 @@ class ConfigSpec: }, "generator": 
{ "type_hint": torch.Generator, - "default": None, "description": "Torch generator for deterministic generation.", }, "sigmas": { "type_hint": List[float], - "default": None, "description": "Custom sigmas for the denoising process.", }, "strength": { @@ -378,33 +375,16 @@ class ConfigSpec: "description": "Strength for img2img/inpainting.", }, "image": { - "type_hint": PIL.Image.Image, + "type_hint": Union[PIL.Image.Image, List[PIL.Image.Image]], "required": True, - "description": "Input image for img2img, editing, or conditioning.", - }, - "mask_image": { - "type_hint": PIL.Image.Image, - "required": True, - "description": "Mask image for inpainting.", - }, - "control_image": { - "type_hint": PIL.Image.Image, - "required": True, - "description": "Control image for ControlNet conditioning.", - }, - "padding_mask_crop": { - "type_hint": int, - "default": None, - "description": "Padding for mask cropping in inpainting.", + "description": "Reference image(s) for denoising. Can be a single image or list of images.", }, "latents": { "type_hint": torch.Tensor, - "default": None, "description": "Pre-generated noisy latents for image generation.", }, "timesteps": { "type_hint": torch.Tensor, - "default": None, "description": "Timesteps for the denoising process.", }, "output_type": { @@ -414,14 +394,28 @@ class ConfigSpec: }, "attention_kwargs": { "type_hint": Dict[str, Any], - "default": None, "description": "Additional kwargs for attention processors.", }, "denoiser_input_fields": { "kwargs_type": "denoiser_input_fields", - "type_hint": torch.Tensor, "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", }, + # inpainting + "mask_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Mask image for inpainting.", + }, + "padding_mask_crop": { + "type_hint": int, + "description": "Padding for mask cropping in inpainting.", + }, + # controlnet + "control_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Control image for ControlNet conditioning.", + }, "control_guidance_start": { "type_hint": float, "default": 0.0, @@ -437,6 +431,45 @@ class ConfigSpec: "default": 1.0, "description": "Scale for ControlNet conditioning.", }, + "layers": { + "type_hint": int, + "default": 4, + "description": "Number of layers to extract from the image", + }, + # common intermediate inputs + "prompt_embeds":{ + "type_hint": torch.Tensor, + "required": True, + "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.", + }, + "prompt_embeds_mask": { + "type_hint": torch.Tensor, + "required": True, + "description": "mask for the text embeddings. Can be generated from text_encoder step.", + }, + "negative_prompt_embeds": { + "type_hint": torch.Tensor, + "description": "negative text embeddings used to guide the image generation. Can be generated from text_encoder step.", + }, + "negative_prompt_embeds_mask": { + "type_hint": torch.Tensor, + "description": "mask for the negative text embeddings. Can be generated from text_encoder step.", + }, + "image_latents": { + "type_hint": torch.Tensor, + "required": True, + "description": "image latents used to guide the image generation. Can be generated from vae_encoder step.", + }, + "batch_size": { + "type_hint": int, + "default": 1, + "description": "Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", + }, + "dtype": { + "type_hint": torch.dtype, + "default": torch.float32, + "description": "The dtype of the model inputs, can be generated in input step.", + }, } OUTPUT_PARAM_TEMPLATES = { @@ -448,15 +481,34 @@ class ConfigSpec: "type_hint": torch.Tensor, "description": "Denoised latents.", }, + # intermediate outputs + "prompt_embeds": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The prompt embeddings.", + }, + "prompt_embeds_mask": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The encoder attention mask.", + }, + "negative_prompt_embeds": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The negative prompt embeddings.", + }, + "negative_prompt_embeds_mask": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The negative prompt embeddings mask.", + }, + "image_latents": { + "type_hint": torch.Tensor, + "description": "The latent representation of the input image.", + }, } -# YiYi Notes: both inputs and intermediate_inputs are InputParam objects -# however some fields are not relevant for intermediate_inputs -# e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed -# default is not used for intermediate_inputs, we only use default from inputs, so it is ignored if it is set for intermediate_inputs -# -> should we use different class for inputs and intermediate_inputs? -@dataclass class InputParam: """Specification for an input parameter.""" @@ -465,31 +517,37 @@ class InputParam: default: Any = None required: bool = False description: str = "" - kwargs_type: str = None # YiYi Notes: remove this feature (maybe) + kwargs_type: str = None + + def __post_init__(self): + if self.required and self.default is not None: + raise ValueError(f"InputParam '{self.name}' cannot be both required and have a default value") def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @classmethod - def template(cls, name: str, **overrides) -> "InputParam": - """Get template for name if exists, otherwise return basic InputParam with just the name.""" - if name in INPUT_PARAM_TEMPLATES: - kwargs = {"name": name, **INPUT_PARAM_TEMPLATES[name]} - # Override with user-provided values - for key, value in overrides.items(): - kwargs[key] = value - return cls(**kwargs) - return cls(name=name, **overrides) + def template(cls, name: str, note: str = None, **overrides) -> "InputParam": + """Get template for name if exists, otherwise raise ValueError.""" + if name not in INPUT_PARAM_TEMPLATES: + raise ValueError(f"InputParam template for {name} not found") + + template_kwargs = INPUT_PARAM_TEMPLATES[name].copy() + + if note and "description" in template_kwargs: + template_kwargs["description"] = f"{template_kwargs['description']} ({note})" + + template_kwargs.update(overrides) + return cls(name=name, **template_kwargs) -@dataclass class OutputParam: """Specification for an output parameter.""" name: str = None type_hint: Any = None description: str = "" - kwargs_type: str = None # YiYi notes: remove this feature (maybe) + kwargs_type: str = None def __repr__(self): return ( @@ -497,15 +555,18 @@ def __repr__(self): ) @classmethod - def template(cls, name: str, **overrides) -> "OutputParam": - """Get template for name if exists, otherwise return basic OutputParam with just the 
name.""" - if name in OUTPUT_PARAM_TEMPLATES: - kwargs = {"name": name, **OUTPUT_PARAM_TEMPLATES[name]} - # Override with user-provided values - for key, value in overrides.items(): - kwargs[key] = value - return cls(**kwargs) - return cls(name=name, **overrides) + def template(cls, name: str, note: str = None, **overrides) -> "OutputParam": + """Get template for name if exists, otherwise raise ValueError.""" + if name not in OUTPUT_PARAM_TEMPLATES: + raise ValueError(f"OutputParam template for {name} not found") + + template_kwargs = OUTPUT_PARAM_TEMPLATES[name].copy() + + if note and "description" in template_kwargs: + template_kwargs["description"] = f"{template_kwargs['description']} ({note})" + + template_kwargs.update(overrides) + return cls(name=name, **template_kwargs) def format_inputs_short(inputs): From 1f2dbc9dd2bf4d256039120f6d6ccaf49f1c09c7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 04:10:17 +0100 Subject: [PATCH 15/23] up --- .../qwenimage/before_denoise.py | 187 +++---- .../modular_pipelines/qwenimage/decoders.py | 71 +-- .../modular_pipelines/qwenimage/denoise.py | 125 +---- .../modular_pipelines/qwenimage/encoders.py | 509 ++++++++---------- .../modular_pipelines/qwenimage/inputs.py | 282 +++++++--- .../qwenimage/modular_blocks_qwenimage.py | 61 ++- .../modular_blocks_qwenimage_edit.py | 39 +- .../modular_blocks_qwenimage_edit_plus.py | 30 +- .../modular_blocks_qwenimage_layered.py | 73 ++- 9 files changed, 677 insertions(+), 700 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index cb808b1d3807..b87c3555aad3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -134,28 +134,20 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.latents(), - InputParam.height(), - InputParam.width(), - InputParam.num_images_per_prompt(), - InputParam.generator(), - InputParam( - name="batch_size", - required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", - ), - InputParam( - name="dtype", - required=True, - type_hint=torch.dtype, - description="The dtype of the model inputs, can be generated in input step.", - ), + InputParam.template("latents"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("num_images_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ + OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -225,31 +217,21 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.latents(), - InputParam.height(), - InputParam.width(), - InputParam( - name="layers", type_hint=int, default=4, description="Number of layers to extract from the image" - ), - InputParam.num_images_per_prompt(), - InputParam.generator(), - InputParam( - name="batch_size", - required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.", - ), - InputParam( - name="dtype", - required=True, - type_hint=torch.dtype, - description="The dtype of the model inputs, can be generated in input step.", - ), + InputParam.template("latents"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("layers"), + InputParam.template("num_images_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ + OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -325,18 +307,8 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The initial random noised, can be generated in prepare latent step.", ), - InputParam( - name="image_latents", - required=True, - type_hint=torch.Tensor, - description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.", - ), - InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. 
Can be generated in set_timesteps step.", - ), + InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), + InputParam.template("timesteps", required=True, note="can be generated in set_timesteps step."), ] @property @@ -347,6 +319,11 @@ def intermediate_outputs(self) -> List[OutputParam]: type_hint=torch.Tensor, description="The initial random noised used for inpainting denoising.", ), + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The scalednoisy latents to use for inpainting/image-to-image denoising.", + ), ] @staticmethod @@ -406,9 +383,9 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The processed mask to use for the inpainting process.", ), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="dtype", required=True), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("dtype"), ] @property @@ -468,14 +445,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_inference_steps(), - InputParam.sigmas(), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process, used to calculate the image sequence length.", - ), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + InputParam.template("latents", required=True, description="The initial random noised latents for the denoising process, used to calculate the image sequence length. Can be generated in prepare latents step."), ] @property @@ -484,6 +456,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process" ), + OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -534,15 +507,16 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_inference_steps(), - InputParam.sigmas(), - InputParam("image_latents", required=True, type_hint=torch.Tensor), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="timesteps", type_hint=torch.Tensor), + OutputParam(name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"), + OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), ] @torch.no_grad() @@ -592,15 +566,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_inference_steps(), - InputParam.sigmas(), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process, used to calculate the image sequence length.", - ), - InputParam.strength(0.9), + InputParam.template("num_inference_steps"), + 
InputParam.template("sigmas"), + InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare latents step."), + InputParam.template("strength", default=0.9), ] @property @@ -609,7 +578,12 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="timesteps", type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", + description="The timesteps to use for the denoising process.", + ), + OutputParam( + name="num_inference_steps", + type_hint=int, + description="The number of denoising steps to perform at inference time", ), ] @@ -668,11 +642,11 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam.template("height", note="should be updated in prepare latents step."), + InputParam.template("width", note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -734,13 +708,13 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="image_height", required=True), - InputParam(name="image_width", required=True), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), + InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. Can be generated in input step."), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -813,13 +787,13 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="image_height", required=True, type_hint=List[int]), - InputParam(name="image_width", required=True, type_hint=List[int]), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam(name="image_height", required=True, type_hint=List[int], descrption="The heights of the reference images. Can be generated in input step."), + InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. 
Can be generated in input step."), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -887,12 +861,12 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="layers", default=4, description="Number of layers to extract from the image"), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam.template("layers"), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -973,16 +947,11 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.control_guidance_start(), - InputParam.control_guidance_end(), - InputParam.controlnet_conditioning_scale(), - InputParam("control_image_latents", required=True), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), + InputParam.template("control_guidance_start"), + InputParam.template("control_guidance_end"), + InputParam.template("controlnet_conditioning_scale"), + InputParam("control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step."), + InputParam.template("timesteps", required=True, note="Can be generated in set_timesteps step."), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 8207e99b69ae..499f0172888b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -47,14 +47,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", - ), + InputParam.template("height", required=True, note="should be updated in input and prepare latents step."), + InputParam.template("width", required=True, note="should be updated in input and prepare latents step."), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents", note="unpacked to B, C, 1, H, W"), ] @torch.no_grad() @@ -86,10 +87,16 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("height", required=True, type_hint=int), - InputParam("width", required=True, type_hint=int), - InputParam("layers", default=4, description="Number of layers to extract from the image"), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("layers"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W"), ] @torch.no_grad() @@ -128,17 +135,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", - ), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), ] @property - def intermediate_outputs(self) -> List[str]: - return [OutputParam.images()] + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images", note="tensor output of the vae decoder.")] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -190,19 +192,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", - ), - InputParam.output_type(), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), + 
InputParam.template("output_type"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.images(), + OutputParam.template("images"), ] @torch.no_grad() @@ -269,10 +266,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image from decoders step"), - InputParam.output_type(), + InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam.template("output_type"), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images")] + @staticmethod def check_inputs(output_type): if output_type not in ["pil", "np", "pt"]: @@ -314,11 +315,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image from decoders step"), - InputParam.output_type(), - InputParam("mask_overlay_kwargs"), + InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam.template("output_type"), + InputParam("mask_overlay_kwargs", description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images")] + @staticmethod def check_inputs(output_type, mask_overlay_kwargs): if output_type not in ["pil", "np", "pt"]: diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 472945b2269a..49fde3fd6ac3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -49,12 +49,7 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", - ), + InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), ] @torch.no_grad() @@ -79,18 +74,8 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", - ), - InputParam( - "image_latents", - required=True, - type_hint=torch.Tensor, - description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.", - ), + InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), + InputParam.template("image_latents", note="Can be encoded in vae_encoder step and packed in prepare_image_latents step."), ] @torch.no_grad() @@ -134,30 +119,10 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", ), - InputParam( - "controlnet_conditioning_scale", - type_hint=float, - description="The controlnet conditioning scale value to use for the denoising process. 
Can be generated in prepare_controlnet_inputs step.", - ), - InputParam( - "controlnet_keep", - required=True, - type_hint=List[float], - description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam( - kwargs_type="denoiser_input_fields", - description=( - "All conditional model inputs for the denoiser. " - "It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens." - ), - ), + InputParam.template("controlnet_conditioning_scale", note="Can be generated in prepare_controlnet_inputs step."), + InputParam.template("controlnet_keep", note="Can be generated in prepare_controlnet_inputs step."), + InputParam.template("num_inference_steps", required=True, note="Can be updated in set_timesteps step."), + InputParam.template("denoiser_input_fields") ] @torch.no_grad() @@ -218,25 +183,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.attention_kwargs(), - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam.denoiser_input_fields(), + InputParam.template("attention_kwargs"), + InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare_latents step."), + InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", required=True, type_hint=List[Tuple[int, int]], - description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.", + description="The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.", ), ] @@ -319,20 +274,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.attention_kwargs(), - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam.denoiser_input_fields(), + InputParam.template("attention_kwargs"), + InputParam.template("latents", required=True, description="The latents to use for the denoising process. 
Can be generated in prepare_latents step."), + InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", required=True, @@ -418,7 +363,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @torch.no_grad() @@ -459,24 +404,14 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "image_latents", - required=True, - type_hint=torch.Tensor, - description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.", - ), + InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), + InputParam.template("timesteps", required=True, note="should be updated in set_timesteps step."), ] @torch.no_grad() @@ -517,18 +452,8 @@ def loop_expected_components(self) -> List[ComponentSpec]: @property def loop_inputs(self) -> List[InputParam]: return [ - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. 
Can be generated in set_timesteps step.", - ), + InputParam.template("timesteps", required=True, note="should be generated in set_timesteps step."), + InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), ] @torch.no_grad() @@ -560,6 +485,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # Qwen Image (text2image, image2image) + +# auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" @@ -584,6 +511,7 @@ def description(self) -> str: # Qwen Image (inpainting) +# auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" block_classes = [ @@ -609,6 +537,7 @@ def description(self) -> str: # Qwen Image (text2image, image2image) with controlnet +# auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" block_classes = [ @@ -634,6 +563,7 @@ def description(self) -> str: # Qwen Image (inpainting) with controlnet +# auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" block_classes = [ @@ -667,6 +597,7 @@ def description(self) -> str: # Qwen Image Edit (image2image) +# auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage-edit" block_classes = [ @@ -690,6 +621,7 @@ def description(self) -> str: # Qwen Image Edit (inpainting) +# auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage-edit" block_classes = [ @@ -715,6 +647,7 @@ def description(self) -> str: # Qwen Image Layered (image2image) +# auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage-layered" block_classes = [ diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 8d7b1905423d..82a3b6811959 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -259,33 +259,30 @@ def encode_vae_image( # ==================== # 1. RESIZE # ==================== +# In QwenImage pipelines, resize is a separate step because the resized image is used in VL encoding and vae encoder blocks: +# +# image (PIL.Image.Image) +# │ +# ▼ +# resized_image ([PIL.Image.Image]) +# │ +# ├──► text_encoder ──► prompt_embeds, prompt_embeds_mask +# │ (VL encoding needs the resized image for vision-language fusion) +# │ +# └──► image_processor ──► processed_image (torch.Tensor, pixel space) +# │ +# ▼ +# vae_encoder ──► image_latents (torch.Tensor, latent space) +# +# In most of our other pipelines, resizing is done as part of the image preprocessing step. +# ==================== class QwenImageEditResizeStep(ModularPipelineBlocks): model_name = "qwenimage-edit" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - ): - """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. - - Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". 
- """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - super().__init__() @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." + return "Image Resize step that resize the image to target area while maintaining the aspect ratio." @property def expected_components(self) -> List[ComponentSpec]: @@ -300,21 +297,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [ - InputParam.template(self._image_input_name) - or InputParam( - name=self._image_input_name, - required=True, - type_hint=torch.Tensor, - description="Input image for conditioning", - ), - ] + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", ), ] @@ -322,7 +313,7 @@ def intermediate_outputs(self) -> List[OutputParam]: def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -338,7 +329,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): for image in images ] - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images self.set_block_state(state, block_state) return components, state @@ -346,30 +337,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageLayeredResizeStep(ModularPipelineBlocks): model_name = "qwenimage-layered" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - ): - """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. - - Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". - """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - super().__init__() - @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." + return f"Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." 
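# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): rough shape of a "resize to a target
# area while maintaining the aspect ratio" computation. The real steps delegate to
# `calculate_dimensions(...)` and `components.image_resize_processor`; rounding to
# a multiple of 32 below is an assumption made only for this illustration.
import math
import PIL.Image

def resize_to_area(image: PIL.Image.Image, target_area: int, multiple: int = 32) -> PIL.Image.Image:
    aspect_ratio = image.width / image.height
    width = math.sqrt(target_area * aspect_ratio)   # width * height == target_area
    height = width / aspect_ratio                   # width / height == aspect_ratio
    width = max(multiple, round(width / multiple) * multiple)
    height = max(multiple, round(height / multiple) * multiple)
    return image.resize((int(width), int(height)))

# e.g. the Edit Plus step further below uses a ~1024*1024 target area for the VAE
# branch and a ~384*384 target area for the VL text-encoder branch.
# --------------------------------------------------------------------------------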
@property def expected_components(self) -> List[ComponentSpec]: @@ -385,10 +355,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) - or InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" - ), + InputParam.template("image"), InputParam( name="resolution", default=640, @@ -399,11 +366,11 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [ - OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" - ), - ] + return [OutputParam( + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", + )] @staticmethod def check_inputs(resolution: int): @@ -416,7 +383,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.check_inputs(resolution=block_state.resolution) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -433,45 +400,21 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): for image in images ] - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images self.set_block_state(state, block_state) return components, state class QwenImageEditPlusResizeStep(ModularPipelineBlocks): - """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus.""" model_name = "qwenimage-edit-plus" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - target_area: int = 1024 * 1024, - ): - """Create a step for resizing images to a target area. - - Each image is resized independently based on its own aspect ratio. This is suitable for Edit Plus where - multiple reference images can have different dimensions. - - Args: - input_name (str, optional): Name of the image field to read. Defaults to "image". - output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image". - target_area (int, optional): Target area in pixels. Defaults to 1024*1024. - """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - self._target_area = target_area - super().__init__() - @property def description(self) -> str: return ( - f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n" + "Resize images for QwenImage Edit Plus pipeline.\n" + "Produces two outputs: resized_image (1024x1024) for VAE encoding, " + "resized_cond_image (384x384) for VL text encoding.\n" "Each image is resized independently based on its own aspect ratio." 
) @@ -488,21 +431,21 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [ - InputParam.template(self._image_input_name) - or InputParam( - name=self._image_input_name, - required=True, - type_hint=torch.Tensor, - description="The image(s) to resize", - ), - ] + # image + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + name="resized_image", + type_hint=List[PIL.Image.Image], + description="Images resized to 1024x1024 target area for VAE encoding", + ), + OutputParam( + name="resized_cond_image", + type_hint=List[PIL.Image.Image], + description="Images resized to 384x384 target area for VL text encoding", ), ] @@ -510,7 +453,7 @@ def intermediate_outputs(self) -> List[OutputParam]: def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -520,16 +463,24 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # Resize each image independently based on its own aspect ratio resized_images = [] + resized_cond_images = [] for image in images: image_width, image_height = image.size - calculated_width, calculated_height, _ = calculate_dimensions( - self._target_area, image_width / image_height - ) + + # For VAE encoder (1024x1024 target area) + vae_width, vae_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) resized_images.append( - components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width) + components.image_resize_processor.resize(image, height=vae_height, width=vae_width) + ) + + # For VL text encoder (384x384 target area) + vl_width, vl_height, _ = calculate_dimensions(384 * 384, image_width / image_height) + resized_cond_images.append( + components.image_resize_processor.resize(image, height=vl_height, width=vl_width) ) - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images + block_state.resized_cond_image = resized_cond_images self.set_block_state(state, block_state) return components, state @@ -538,13 +489,14 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 2. GET IMAGE PROMPT # ==================== class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): - """ - Auto-caption step that generates a text prompt from the input image if none is provided. Uses the VL model to - generate a description of the image. 
- """ model_name = "qwenimage-layered" + def __init__(self): + self.image_caption_prompt_en = QWENIMAGE_LAYERED_CAPTION_PROMPT_EN + self.image_caption_prompt_cn = QWENIMAGE_LAYERED_CAPTION_PROMPT_CN + super().__init__() + @property def description(self) -> str: return ( @@ -560,19 +512,10 @@ def expected_components(self) -> List[ComponentSpec]: ComponentSpec("processor", Qwen2VLProcessor), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="image_caption_prompt_en", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_EN), - ConfigSpec(name="image_caption_prompt_cn", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_CN), - ] - @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name="prompt", type_hint=str, description="The prompt to encode" - ), # it is not required for qwenimage-layered, unlike other pipelines + InputParam.template("prompt", required=False), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -596,9 +539,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # If prompt is empty or None, generate caption from image if block_state.prompt is None or block_state.prompt == "" or block_state.prompt == " ": if block_state.use_en_prompt: - caption_prompt = components.config.image_caption_prompt_en + caption_prompt = self.image_caption_prompt_en else: - caption_prompt = components.config.image_caption_prompt_cn + caption_prompt = self.image_caption_prompt_cn model_inputs = components.processor( text=caption_prompt, @@ -627,6 +570,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_PROMPT_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_PROMPT_TEMPLATE_START_IDX + self.tokenizer_max_length = 1024 + super().__init__() + @property def description(self) -> str: return "Text Encoder step that generates text embeddings to guide the image generation." 
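# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): with the caption/prompt templates moved
# from `expected_configs` onto the block instance in `__init__`, a customized
# encoder is built by overriding the attributes rather than passing ConfigSpec
# values. The template string and start index below are hypothetical, shown only
# to illustrate the pattern; the import path assumes this patched source tree.
from diffusers.modular_pipelines.qwenimage.encoders import QwenImageTextEncoderStep

text_encoder_step = QwenImageTextEncoderStep()
text_encoder_step.prompt_template_encode = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"  # hypothetical template
text_encoder_step.prompt_template_encode_start_idx = 10  # hypothetical drop index
text_encoder_step.tokenizer_max_length = 1024
# --------------------------------------------------------------------------------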
@@ -644,49 +593,22 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_PROMPT_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_PROMPT_TEMPLATE_START_IDX), - ConfigSpec(name="tokenizer_max_length", default=1024), - ] @property def inputs(self) -> List[InputParam]: return [ - InputParam.prompt(), - InputParam.negative_prompt(), - InputParam.max_sequence_length(1024), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), + InputParam.template("max_sequence_length", default=1024), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -715,9 +637,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.text_encoder, components.tokenizer, prompt=block_state.prompt, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - tokenizer_max_length=components.config.tokenizer_max_length, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, + tokenizer_max_length=self.tokenizer_max_length, device=device, ) @@ -732,9 +654,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.text_encoder, components.tokenizer, prompt=negative_prompt, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - tokenizer_max_length=components.config.tokenizer_max_length, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, + tokenizer_max_length=self.tokenizer_max_length, device=device, ) block_state.negative_prompt_embeds = block_state.negative_prompt_embeds[ @@ -751,6 +673,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_EDIT_PROMPT_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX + super().__init__() + @property def description(self) -> str: return "Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation." 
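# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): `InputParam.template(...)` and
# `OutputParam.template(...)` are called throughout this diff but their definition
# is not included in it. One plausible backing is a registry of named parameter
# specs that per-step overrides (`required=`, `default=`, `note=`) are merged into;
# everything below is an assumption for illustration, not the library's API.
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

_INPUT_TEMPLATES = {  # hypothetical registry
    "prompt": dict(type_hint=str, required=True, description="The prompt or prompts to guide image generation."),
    "height": dict(type_hint=int, default=None, description="The height in pixels of the generated image."),
}

def input_param_from_template(name, note=None, **overrides):
    spec = {**_INPUT_TEMPLATES[name], **overrides}
    if note:  # step-specific note appended to the shared description
        spec["description"] = (spec.get("description", "") + " " + note).strip()
    return InputParam(name=name, **spec)

# e.g. input_param_from_template("height", required=True, note="should be updated in prepare latents step.")
# --------------------------------------------------------------------------------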
@@ -768,18 +695,12 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX), - ] @property def inputs(self) -> List[InputParam]: return [ - InputParam.prompt(), - InputParam.negative_prompt(), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), InputParam( name="resized_image", required=True, @@ -791,30 +712,10 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -842,8 +743,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=block_state.prompt, image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -856,8 +757,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=negative_prompt, image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -866,10 +767,15 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): - """Text encoder for QwenImage Edit Plus (VL encoding with multiple images).""" model_name = "qwenimage-edit-plus" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE + self.img_template_encode = QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX + super().__init__() + @property def description(self) -> str: return ( @@ -890,19 +796,12 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE), - ConfigSpec(name="img_template_encode", default=QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", 
default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX), - ] @property def inputs(self) -> List[InputParam]: return [ - InputParam.prompt(), - InputParam.negative_prompt(), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), InputParam( name="resized_cond_image", required=True, @@ -914,30 +813,10 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -965,9 +844,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=block_state.prompt, image=block_state.resized_cond_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + img_template_encode=self.img_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -981,9 +860,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=negative_prompt, image=block_state.resized_cond_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + img_template_encode=self.img_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) ) @@ -1016,18 +895,26 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.mask_image(), - InputParam.image(), - InputParam.height(), - InputParam.width(), - InputParam.padding_mask_crop(), + InputParam.template("mask_image"), + InputParam.template("image"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("padding_mask_crop"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="processed_image"), - OutputParam(name="processed_mask_image"), + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ), + OutputParam( + name="processed_mask_image", + type_hint=torch.Tensor, + description="The processed mask image", + ), OutputParam( name="mask_overlay_kwargs", type_hint=Dict, @@ -1088,21 +975,29 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.mask_image(), + 
InputParam.template("mask_image"), InputParam( - "resized_image", + name="resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step", ), - InputParam.padding_mask_crop(), + InputParam.template("padding_mask_crop"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="processed_image"), - OutputParam(name="processed_mask_image"), + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image" + ), + OutputParam( + name="processed_mask_image", + type_hint=torch.Tensor, + description="The processed mask image", + ), OutputParam( name="mask_overlay_kwargs", type_hint=Dict, @@ -1151,14 +1046,18 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.image(), - InputParam.height(), - InputParam.width(), + InputParam.template("image"), + InputParam.template("height"), + InputParam.template("width"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + )] @staticmethod def check_inputs(height, width, vae_scale_factor): @@ -1209,12 +1108,21 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("resized_image", required=True), + InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + ), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + )] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1252,11 +1160,20 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam("resized_image")] + return [InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + )] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + )] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1274,7 +1191,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): processed_images.append( components.image_processor.preprocess(image=img, height=img_height, width=img_width) ) - block_state.processed_image = processed_images + if is_image_list: block_state.processed_image = processed_images else: @@ -1294,8 +1211,8 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): def __init__( self, - input_name: str = "processed_image", - output_name: str = "image_latents", + input: Optional[InputParam] = None, + output: Optional[OutputParam] = None, ): """Initialize a VAE encoder step for converting images to latent representations. @@ -1303,11 +1220,24 @@ def __init__( a single tensor, outputs a single latent tensor. Args: - input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image". 
- output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents". + input (InputParam, optional): Input parameter for the processed image. Defaults to "processed_image". + output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents". """ - self._image_input_name = input_name - self._image_latents_output_name = output_name + if input is None: + input = InputParam(name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode") + + if output is None: + output = OutputParam.template("image_latents") + + if not isinstance(input, InputParam): + raise ValueError(f"input must be InputParam but is {type(input)}") + if not isinstance(output, OutputParam): + raise ValueError(f"output must be OutputParam but is {type(output)}") + + self._input = input + self._output = output + self._image_input_name = input.name + self._image_latents_output_name = output.name super().__init__() @property @@ -1324,20 +1254,13 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) - or InputParam(name=self._image_input_name, required=True, description="The image tensor to encode"), - InputParam.generator(), + self._input, # default is "processed_image" + InputParam.template("generator"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [ - OutputParam( - self._image_latents_output_name, - type_hint=torch.Tensor, - description="The latents representing the reference image(s). Single tensor or list depending on input.", - ) - ] + return [self._output] # default is "image_latents" @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -1398,10 +1321,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.control_image(), - InputParam.height(), - InputParam.width(), - InputParam.generator(), + InputParam.template("control_image"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("generator"), ] return inputs @@ -1489,22 +1412,22 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 6. PERMUTE LATENTS # ==================== class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): - """Permute image latents from VAE format to Layered format.""" - model_name = "qwenimage-layered" - def __init__(self, input_name: str = "image_latents"): - self._input_name = input_name - super().__init__() - @property def description(self) -> str: - return f"Permute {self._input_name} from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." + return f"Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." 
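# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): the VAE encoder step is now configured
# with InputParam/OutputParam objects instead of bare field names, so the same
# block can be pointed at a different image stream. The "processed_control_image"
# name below is an example, not something introduced by this diff; import paths
# assume this patched source tree.
import torch

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam, OutputParam
from diffusers.modular_pipelines.qwenimage.encoders import QwenImageVaeEncoderStep

control_image_encoder = QwenImageVaeEncoderStep(
    input=InputParam(
        name="processed_control_image", required=True, type_hint=torch.Tensor,
        description="The preprocessed control image to encode",
    ),
    output=OutputParam(
        name="control_image_latents", type_hint=torch.Tensor,
        description="The control image latents",
    ),
)
# --------------------------------------------------------------------------------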
@property def inputs(self) -> List[InputParam]: return [ - InputParam(self._input_name, required=True), + InputParam.template("image_latents"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("image_latents", note="permuted from [B, C, 1, H, W] to [B, 1, C, H, W]"), ] @torch.no_grad() @@ -1512,8 +1435,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Permute: (B, C, 1, H, W) -> (B, 1, C, H, W) - latents = getattr(block_state, self._input_name) - setattr(block_state, self._input_name, latents.permute(0, 2, 1, 3, 4)) + latents = block_state.image_latents + block_state.image_latents = latents.permute(0, 2, 1, 3, 4) self.set_block_state(state, block_state) - return components, state + return components, state \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index e28493ecc369..bd2f79ae7c4c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple +from typing import List, Tuple, Optional import torch @@ -129,26 +129,22 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_images_per_prompt(), - InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"), - InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"), - InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"), - InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"), + InputParam.template("num_images_per_prompt"), + InputParam.template("prompt_embeds"), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask"), ] @property - def intermediate_outputs(self) -> List[str]: + def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - "batch_size", - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt", - ), - OutputParam( - "dtype", - type_hint=torch.dtype, - description="Data type of model tensor inputs (determined by `prompt_embeds`)", - ), + OutputParam.template("batch_size"), + OutputParam.template("dtype"), + OutputParam.template("prompt_embeds", note="batch-expanded"), + OutputParam.template("prompt_embeds_mask", note="batch-expanded"), + OutputParam.template("negative_prompt_embeds", note="batch-expanded"), + OutputParam.template("negative_prompt_embeds_mask", note="batch-expanded"), ] @staticmethod @@ -228,13 +224,28 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], - ): + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, + ): + # by default, process `image_latents` + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise 
ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -252,9 +263,9 @@ def description(self) -> str: if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." @@ -269,23 +280,19 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.num_images_per_prompt(), - InputParam(name="batch_size", required=True), - InputParam.height(), - InputParam.width(), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), + InputParam.template("height"), + InputParam.template("width"), ] - - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) + # default is `image_latents` + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=int, @@ -295,14 +302,42 @@ def intermediate_outputs(self) -> List[OutputParam]: name="image_width", type_hint=int, description="The image width calculated from the image latents dimension", - ), + ) ] + # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided + if len(self._image_latent_inputs) > 0: + outputs.append(OutputParam(name="height", type_hint=int, note="updated based on image size if not provided")) + outputs.append(OutputParam(name="width", type_hint=int, note="updated based on image size if not provided")) + + # image latent inputs are modified in place (patchified and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified and batch-expanded)", + ) + ) + + # additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " 
(batch-expanded)", + ) + ) + + return outputs + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -331,7 +366,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, image_latent_input_name, image_latent_tensor) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -356,13 +392,27 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, ): + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -381,9 +431,9 @@ def description(self) -> str: if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
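# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): the additional-inputs step is now
# parametrized with InputParam objects instead of plain strings, so type hints and
# descriptions travel with the configured names. The extra "control_image_latents"
# entry below is only an example of an additional batch input; import paths assume
# this patched source tree.
import torch

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam
from diffusers.modular_pipelines.qwenimage.inputs import QwenImageAdditionalInputsStep

step = QwenImageAdditionalInputsStep(
    image_latent_inputs=[InputParam.template("image_latents")],  # same as the default
    additional_batch_inputs=[
        InputParam(
            name="control_image_latents", type_hint=torch.Tensor,
            description="Control image latents to batch-expand",
        ),
    ],
)
# --------------------------------------------------------------------------------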
@@ -398,23 +448,20 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.num_images_per_prompt(), - InputParam(name="batch_size", required=True), - InputParam.height(), - InputParam.width(), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), + InputParam.template("height"), + InputParam.template("width"), ] - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) + # default is `image_latents` + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=List[int], @@ -426,12 +473,40 @@ def intermediate_outputs(self) -> List[OutputParam]: description="The image widths calculated from the image latents dimension", ), ] + + # `height`/`width` are updated if any image latent inputs are provided + if len(self._image_latent_inputs) > 0: + outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + + # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified, concatenated, and batch-expanded)", + ) + ) + + # additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (batch-expanded)", + ) + ) + + return outputs def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -476,7 +551,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, image_latent_input_name, packed_image_latent_tensors) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -494,8 +570,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -# YiYi TODO: support define config default component from the ModularPipeline level. -# it is same as QwenImageAdditionalInputsStep, but with layered pachifier. +# same as QwenImageAdditionalInputsStep, but with layered pachifier. 
class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier.""" @@ -503,13 +578,27 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, ): + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -527,9 +616,9 @@ def description(self) -> str: if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
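# --------------------------------------------------------------------------------
# Editor's sketch (not part of the patch): what "batch-expanded" means in the
# intermediate output descriptions of these additional-inputs steps. A per-prompt
# tensor is repeated along dim 0 until its batch dimension equals
# batch_size * num_images_per_prompt. The exact expansion these blocks perform is
# not shown in this hunk; this is a minimal stand-in for reading the descriptions.
import torch

def expand_batch(tensor: torch.Tensor, batch_size: int, num_images_per_prompt: int) -> torch.Tensor:
    final_batch_size = batch_size * num_images_per_prompt
    repeats = final_batch_size // tensor.shape[0]
    return tensor.repeat_interleave(repeats, dim=0)

# e.g. image_latents of shape (1, seq_len, channels) with batch_size=2 and
# num_images_per_prompt=2 becomes shape (4, seq_len, channels).
# --------------------------------------------------------------------------------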
@@ -544,21 +633,18 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.num_images_per_prompt(), - InputParam(name="batch_size", required=True), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), ] + # default is `image_latents` - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=int, @@ -569,15 +655,40 @@ def intermediate_outputs(self) -> List[OutputParam]: type_hint=int, description="The image width calculated from the image latents dimension", ), - OutputParam(name="height", type_hint=int, description="The height of the image output"), - OutputParam(name="width", type_hint=int, description="The width of the image output"), ] + if len(self._image_latent_inputs) > 0: + outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + + # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified with layered pachifier and batch-expanded)", + ) + ) + + # Add outputs for additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (batch-expanded)", + ) + ) + + return outputs + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -608,7 +719,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, image_latent_input_name, image_latent_tensor) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -636,11 +748,19 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="control_image_latents", required=True), - InputParam(name="batch_size", required=True), - InputParam.num_images_per_prompt(), - InputParam.height(), - InputParam.width(), + InputParam(name="control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step."), + InputParam.template("batch_size"), + InputParam.template("num_images_per_prompt"), + InputParam.template("height"), + InputParam.template("width"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The control image latents (patchified and batch-expanded)."), + OutputParam(name="height", type_hint=int, description="updated based on control image size if not provided"), + OutputParam(name="width", type_hint=int, description="updated based on control image size if not provided"), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 645c01f66ee5..42593a93f98a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -75,8 +75,11 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -400,8 +403,7 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -440,8 +442,7 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs - (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -478,7 +479,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -519,8 +520,7 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -563,7 +563,7 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. 
attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -606,8 +606,7 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -648,7 +647,7 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -691,8 +690,7 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs - (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -742,6 +740,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. + denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: latents (`Tensor`): @@ -785,8 +785,7 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -842,6 +841,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. + denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: latents (`Tensor`): @@ -887,8 +888,7 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -942,6 +942,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. + denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
Outputs: latents (`Tensor`): @@ -1065,7 +1067,7 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): @@ -1085,8 +1087,7 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: @@ -1098,7 +1099,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. @@ -1182,8 +1183,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -1228,7 +1232,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. image_latents (`None`, *optional*): TODO: Add description. @@ -1244,8 +1248,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 0bfbb921c9c4..46e8881b9521 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -74,10 +74,11 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) @@ -376,8 +377,7 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -452,7 +452,7 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -536,7 +536,7 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -630,7 +630,7 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): @@ -650,8 +650,7 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: @@ -663,7 +662,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. @@ -722,8 +721,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide - `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: @@ -750,10 +748,11 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) @@ -790,10 +789,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 8dab6fbcf95d..1fb967bf1322 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -67,10 +67,11 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -99,7 +100,7 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"), + QwenImageEditPlusResizeStep(), QwenImageEditPlusTextEncoderStep(), ] block_names = ["resize", "encode"] @@ -145,7 +146,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"), + QwenImageEditPlusResizeStep(), QwenImageEditPlusProcessImagesInputStep(), QwenImageVaeEncoderStep(), ] @@ -268,7 +269,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -325,7 +326,7 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): @@ -386,10 +387,11 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -418,10 +420,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. 
Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 544b1abfc3ed..7d6c2ea0635a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,8 +53,7 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: @@ -71,23 +70,28 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -97,11 +101,16 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 
保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -300,7 +309,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -381,23 +390,28 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -407,11 +421,16 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 
保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -444,10 +463,10 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): From fb15752d5538c4e4ec95d8164630cbc374002405 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 08:10:31 +0100 Subject: [PATCH 16/23] up up up --- .../modular_pipeline_utils.py | 35 +++++--- .../qwenimage/before_denoise.py | 79 +++++++++++++------ .../modular_pipelines/qwenimage/decoders.py | 63 ++++++++++++--- .../modular_pipelines/qwenimage/denoise.py | 63 +++++++++++---- .../modular_pipelines/qwenimage/encoders.py | 2 +- .../modular_pipelines/qwenimage/inputs.py | 20 ++--- .../qwenimage/modular_blocks_qwenimage.py | 24 +++--- .../modular_blocks_qwenimage_edit.py | 20 ++--- .../modular_blocks_qwenimage_edit_plus.py | 10 +-- .../modular_blocks_qwenimage_layered.py | 8 +- 10 files changed, 216 insertions(+), 108 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index a65aa43b2a3b..5ef1b98f1ba3 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -397,6 +397,7 @@ class ConfigSpec: "description": "Additional kwargs for attention processors.", }, "denoiser_input_fields": { + "name": None, "kwargs_type": "denoiser_input_fields", "description": "conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", }, @@ -509,6 +510,7 @@ class ConfigSpec: } +@dataclass class InputParam: """Specification for an input parameter.""" @@ -519,20 +521,22 @@ class InputParam: description: str = "" kwargs_type: str = None - def __post_init__(self): - if self.required and self.default is not None: - raise ValueError(f"InputParam '{self.name}' cannot be both required and have a default value") - def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @classmethod - def template(cls, name: str, note: str = None, **overrides) -> "InputParam": + def template(cls, template_name: str, note: str = None, **overrides) -> "InputParam": """Get template for name if exists, otherwise raise ValueError.""" - if name not in INPUT_PARAM_TEMPLATES: - raise ValueError(f"InputParam template for {name} not found") + if template_name not in INPUT_PARAM_TEMPLATES: + raise ValueError(f"InputParam template for {template_name} not found") - template_kwargs = INPUT_PARAM_TEMPLATES[name].copy() + template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy() + + # Determine the actual param name: + # 1. From overrides if provided + # 2. From template if present + # 3. Fall back to template_name + name = overrides.pop("name", template_kwargs.pop("name", template_name)) if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" @@ -541,6 +545,7 @@ def template(cls, name: str, note: str = None, **overrides) -> "InputParam": return cls(name=name, **template_kwargs) +@dataclass class OutputParam: """Specification for an output parameter.""" @@ -555,12 +560,18 @@ def __repr__(self): ) @classmethod - def template(cls, name: str, note: str = None, **overrides) -> "OutputParam": + def template(cls, template_name: str, note: str = None, **overrides) -> "OutputParam": """Get template for name if exists, otherwise raise ValueError.""" - if name not in OUTPUT_PARAM_TEMPLATES: - raise ValueError(f"OutputParam template for {name} not found") + if template_name not in OUTPUT_PARAM_TEMPLATES: + raise ValueError(f"OutputParam template for {template_name} not found") + + template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy() - template_kwargs = OUTPUT_PARAM_TEMPLATES[name].copy() + # Determine the actual param name: + # 1. From overrides if provided + # 2. From template if present + # 3. 
Fall back to template_name + name = overrides.pop("name", template_kwargs.pop("name", template_name)) if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index b87c3555aad3..fc795b5f5a2f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -146,8 +146,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), - OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="height", type_hint=int, description="if not set, updated to default value"), + OutputParam(name="width", type_hint=int, description="if not set, updated to default value"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -230,8 +230,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), - OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="height", type_hint=int, description="if not set, updated to default value"), + OutputParam(name="width", type_hint=int, description="if not set, updated to default value"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -307,8 +307,13 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The initial random noised, can be generated in prepare latent step.", ), - InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), - InputParam.template("timesteps", required=True, note="can be generated in set_timesteps step."), + InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), ] @property @@ -322,7 +327,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="latents", type_hint=torch.Tensor, - description="The scalednoisy latents to use for inpainting/image-to-image denoising.", + description="The scaled noisy latents to use for inpainting/image-to-image denoising.", ), ] @@ -383,8 +388,8 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The processed mask to use for the inpainting process.", ), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("dtype"), ] @@ -447,7 +452,12 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("num_inference_steps"), InputParam.template("sigmas"), - InputParam.template("latents", required=True, description="The initial random noised latents for the denoising process, used to calculate the image sequence length. 
Can be generated in prepare latents step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial random noised latents for the denoising process. Can be generated in prepare latents step." + ), ] @property @@ -456,7 +466,6 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process" ), - OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -515,8 +524,11 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"), - OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), + OutputParam( + name="timesteps", + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process." + ), ] @torch.no_grad() @@ -568,7 +580,12 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("num_inference_steps"), InputParam.template("sigmas"), - InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare latents step."), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare latents step." + ), InputParam.template("strength", default=0.9), ] @@ -583,7 +600,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="num_inference_steps", type_hint=int, - description="The number of denoising steps to perform at inference time", + description="The number of denoising steps to perform at inference time. Updated based on strength.", ), ] @@ -643,8 +660,8 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam.template("height", note="should be updated in prepare latents step."), - InputParam.template("width", note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -711,8 +728,8 @@ def inputs(self) -> List[InputParam]: InputParam.template("batch_size"), InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. 
Can be generated in input step."), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -788,10 +805,10 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=List[int], descrption="The heights of the reference images. Can be generated in input step."), + InputParam(name="image_height", required=True, type_hint=List[int], description="The heights of the reference images. Can be generated in input step."), InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. Can be generated in input step."), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -863,8 +880,8 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), InputParam.template("layers"), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -950,8 +967,18 @@ def inputs(self) -> List[InputParam]: InputParam.template("control_guidance_start"), InputParam.template("control_guidance_end"), InputParam.template("controlnet_conditioning_scale"), - InputParam("control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."), - InputParam.template("timesteps", required=True, note="Can be generated in set_timesteps step."), + InputParam( + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step." + ), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 499f0172888b..4476e1db9bad 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
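Across these hunks the per-block InputParam definitions are replaced by template lookups with optional notes and overrides. The sketch below illustrates how InputParam.template() resolves fields, based on the classmethod shown earlier in this patch; the "height", "image_latents", and "num_inference_steps" template entries and their exact field values are assumptions.

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam  # assumed path

# Plain lookup: name, type_hint, default, and description come from the
# INPUT_PARAM_TEMPLATES entry keyed by "height"; required=True is an override.
height = InputParam.template("height", required=True)

# A note is appended to the template description as " (<note>)".
image_latents = InputParam.template(
    "image_latents",
    note="Can be generated from vae encoder and updated in input step.",
)

# Overrides take precedence over template values; an explicit name= override
# also wins over both the template's own "name" field and the template key.
steps = InputParam.template("num_inference_steps", default=28)

# Unknown keys fail fast:
# InputParam.template("not_a_template")  # ValueError: InputParam template for not_a_template not found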
-from typing import List +from typing import Any, Dict, List import torch @@ -47,15 +47,24 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("height", required=True, note="should be updated in input and prepare latents step."), - InputParam.template("width", required=True, note="should be updated in input and prepare latents step."), - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step." + ), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.template("latents", note="unpacked to B, C, 1, H, W"), + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The denoisedlatents unpacked to B, C, 1, H, W" + ), ] @torch.no_grad() @@ -87,9 +96,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step." + ), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("layers"), ] @@ -135,7 +149,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + ), ] @property @@ -192,7 +211,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." 
+ ), InputParam.template("output_type"), ] @@ -266,7 +290,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam( + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step" + ), InputParam.template("output_type"), ] @@ -315,9 +344,17 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam( + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step" + ), InputParam.template("output_type"), - InputParam("mask_overlay_kwargs", description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), + InputParam( + name="mask_overlay_kwargs", + type_hint=Dict[str, Any], + description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 49fde3fd6ac3..ad6a9677aca3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -49,7 +49,12 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + ), ] @torch.no_grad() @@ -74,8 +79,13 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), - InputParam.template("image_latents", note="Can be encoded in vae_encoder step and packed in prepare_image_latents step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + ), + InputParam.template("image_latents", note="generated in vae encoder step and updated in input step."), ] @torch.no_grad() @@ -119,10 +129,13 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", ), - InputParam.template("controlnet_conditioning_scale", note="Can be generated in prepare_controlnet_inputs step."), - InputParam.template("controlnet_keep", note="Can be generated in prepare_controlnet_inputs step."), - InputParam.template("num_inference_steps", required=True, note="Can be updated in set_timesteps step."), - InputParam.template("denoiser_input_fields") + InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."), + InputParam( + name="controlnet_keep", + required=True, + type_hint=List[float], + description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step." 
+ ), ] @torch.no_grad() @@ -184,8 +197,13 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare_latents step."), - InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare_latents step." + ), + InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -275,8 +293,13 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare_latents step."), - InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare_latents step." + ), + InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -404,14 +427,19 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), + InputParam.template("image_latents", note="Can be generated from vae encoder step and updated in input step."), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam.template("timesteps", required=True, note="should be updated in set_timesteps step."), + InputParam( + "timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), ] @torch.no_grad() @@ -452,8 +480,13 @@ def loop_expected_components(self) -> List[ComponentSpec]: @property def loop_inputs(self) -> List[InputParam]: return [ - InputParam.template("timesteps", required=True, note="should be generated in set_timesteps step."), - InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), + InputParam.template("num_inference_steps", required=True), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 82a3b6811959..9a83f0d7178a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -1145,7 +1145,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Image Preprocess step. Images can be resized first using QwenImageEditResizeStep." + return "Image Preprocess step. 
Images can be resized first. If a list of images is provided, will return a list of processed images." @property def expected_components(self) -> List[ComponentSpec]: diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index bd2f79ae7c4c..b237031b91d2 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -139,8 +139,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.template("batch_size"), - OutputParam.template("dtype"), + OutputParam(name="batch_size", type_hint=int, description="The batch size of the prompt embeddings"), + OutputParam(name="dtype", type_hint=torch.dtype, description="The data type of the prompt embeddings"), OutputParam.template("prompt_embeds", note="batch-expanded"), OutputParam.template("prompt_embeds_mask", note="batch-expanded"), OutputParam.template("negative_prompt_embeds", note="batch-expanded"), @@ -307,8 +307,8 @@ def intermediate_outputs(self) -> List[OutputParam]: # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, note="updated based on image size if not provided")) - outputs.append(OutputParam(name="width", type_hint=int, note="updated based on image size if not provided")) + outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) + outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) # image latent inputs are modified in place (patchified and batch-expanded) for input_param in self._image_latent_inputs: @@ -476,8 +476,8 @@ def intermediate_outputs(self) -> List[OutputParam]: # `height`/`width` are updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) - outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) + outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) for input_param in self._image_latent_inputs: @@ -658,8 +658,8 @@ def intermediate_outputs(self) -> List[OutputParam]: ] if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) - outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) + outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) for input_param in self._image_latent_inputs: @@ -759,8 +759,8 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The 
control image latents (patchified and batch-expanded)."), - OutputParam(name="height", type_hint=int, description="updated based on control image size if not provided"), - OutputParam(name="width", type_hint=int, description="updated based on control image size if not provided"), + OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"), + OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 42593a93f98a..46f0b6f6ff5a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam from .before_denoise import ( QwenImageControlNetBeforeDenoiserStep, QwenImageCreateMaskLatentsStep, @@ -319,7 +319,7 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ model_name = "qwenimage" - block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] + block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep()] block_names = ["text_inputs", "additional_inputs"] @property @@ -373,7 +373,7 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): block_classes = [ QwenImageTextInputsStep(), QwenImageAdditionalInputsStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -512,7 +512,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -598,7 +598,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -682,7 +682,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -777,7 +777,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -880,7 +880,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -981,7 +981,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -1042,7 +1042,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -1279,5 +1279,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 46e8881b9521..158763ce917a 100644 --- 
a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -13,10 +13,11 @@ # limitations under the License. from typing import Optional +import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam from .before_denoise import ( QwenImageCreateMaskLatentsStep, QwenImageEditRoPEInputsStep, @@ -206,7 +207,7 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): block_classes = [ QwenImageEditResizeStep(), QwenImageEditInpaintProcessImagesInputStep(), - QwenImageVaeEncoderStep(input_name="processed_image", output_name="image_latents"), + QwenImageVaeEncoderStep(), ] block_names = ["resize", "preprocess", "encode"] @@ -286,7 +287,7 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -344,8 +345,7 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + QwenImageAdditionalInputsStep(additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -485,7 +485,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -571,7 +571,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -605,7 +605,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -698,7 +698,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -816,5 +816,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 1fb967bf1322..a16dee1c7595 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
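Note on the block-construction changes in the hunks above: blocks now declare extra batch inputs as full `InputParam` objects (with `type_hint` and `description`) instead of bare strings, and reusable outputs are pulled from a shared `OutputParam.template(...)` lookup rather than per-name factory methods such as `OutputParam.latents()`. A minimal sketch of how such a template registry could be wired up is shown below; the class layout and registry contents are assumptions for illustration only, not the actual code in modular_pipeline_utils.py.

    from dataclasses import dataclass
    from typing import Any, ClassVar, Dict, Optional

    import torch


    @dataclass
    class OutputParam:
        name: str
        type_hint: Any = None
        description: str = ""

        # Hypothetical registry of frequently reused outputs; the real template()
        # in modular_pipeline_utils.py may organize this differently.
        _TEMPLATES: ClassVar[Dict[str, Dict[str, Any]]] = {
            "latents": {"type_hint": torch.Tensor, "description": "Denoised latents."},
            "images": {"type_hint": list, "description": "Generated images."},
        }

        @classmethod
        def template(cls, name: str, note: Optional[str] = None) -> "OutputParam":
            # Build an OutputParam from a registered template, optionally appending a note.
            spec = dict(cls._TEMPLATES[name])
            if note is not None:
                spec["description"] = f"{spec['description']} ({note})"
            return cls(name=name, **spec)


    # Usage mirroring the calls in this patch:
    latents_out = OutputParam.template("latents")
    images_out = OutputParam.template("images")
    annotated = OutputParam.template("latents", note="batch-expanded")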
- +import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam from .before_denoise import ( QwenImageEditPlusRoPEInputsStep, QwenImagePrepareLatentsStep, @@ -211,7 +211,7 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), - QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageEditPlusAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -302,7 +302,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -446,5 +446,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7d6c2ea0635a..2471750f2e0b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam @@ -255,7 +255,7 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), - QwenImageLayeredAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageLayeredAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -342,7 +342,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -484,5 +484,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] From 8d45ff5bf60a804a5eaf05933f028e2ddf9772f6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:22:04 +0100 Subject: [PATCH 17/23] apply auto docstring --- .../modular_pipeline_utils.py | 4 +- .../qwenimage/before_denoise.py | 312 ++++++++++++- .../modular_pipelines/qwenimage/decoders.py | 112 +++++ .../modular_pipelines/qwenimage/denoise.py | 295 +++++++++++- .../modular_pipelines/qwenimage/encoders.py | 323 +++++++++++++- .../modular_pipelines/qwenimage/inputs.py | 181 +++++++- .../qwenimage/modular_blocks_qwenimage.py | 421 ++++++++---------- .../modular_blocks_qwenimage_edit.py | 273 ++++++------ .../modular_blocks_qwenimage_edit_plus.py | 150 +++---- .../modular_blocks_qwenimage_layered.py | 216 +++------ 10 files changed, 1616 insertions(+), 671 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 5ef1b98f1ba3..6f1010daf219 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -898,12 +898,12 @@ def make_doc_string( # Add components section if provided if expected_components and len(expected_components) > 0: - components_str = format_components(expected_components, indent_level=2) + 
components_str = format_components(expected_components, indent_level=2, add_empty_lines=False) output += components_str + "\n\n" # Add configs section if provided if expected_configs and len(expected_configs) > 0: - configs_str = format_configs(expected_configs, indent_level=2) + configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False) output += configs_str + "\n\n" # Add inputs section diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index fc795b5f5a2f..0b8cd0f4b2d2 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -117,8 +117,39 @@ def get_timesteps(scheduler, num_inference_steps, strength): # 1. PREPARE LATENTS # ==================== - +# auto_docstring class QwenImagePrepareLatentsStep(ModularPipelineBlocks): + """ + Prepare initial random noise for the generation process + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. + + Outputs: + height (`int`): + if not set, updated to default value + width (`int`): + if not set, updated to default value + latents (`Tensor`): + The initial latents to use for the denoising process + """ model_name = "qwenimage" @property @@ -201,7 +232,41 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): + """ + Prepare initial random noise (B, layers+1, C, H, W) for the generation process + + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. 
+ + Outputs: + height (`int`): + if not set, updated to default value + width (`int`): + if not set, updated to default value + latents (`Tensor`): + The initial latents to use for the denoising process + """ model_name = "qwenimage-layered" @property @@ -285,7 +350,29 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): + """ + Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should alreadybe patchified. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and updated in input step.) + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + Outputs: + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. + """ model_name = "qwenimage" @property @@ -366,7 +453,28 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): + """ + Step that creates mask latents from preprocessed mask_image by interpolating to latent space. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. + + Outputs: + mask (`Tensor`): + The mask to use for the inpainting process. + """ model_name = "qwenimage" @property @@ -433,8 +541,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 2. SET TIMESTEPS # ==================== - +# auto_docstring class QwenImageSetTimestepsStep(ModularPipelineBlocks): + """ + Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`): + The initial random noised latents for the denoising process. Can be generated in prepare latents step. + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process + """ model_name = "qwenimage" @property @@ -500,7 +626,27 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): + """ + Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
+ sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and packed in input step.) + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. + """ model_name = "qwenimage-layered" @property @@ -562,7 +708,30 @@ def __call__(self, components, state: PipelineState) -> PipelineState: return components, state +# auto_docstring class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): + """ + Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare latents step. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`): + The latents to use for the denoising process. Can be generated in prepare latents step. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. + num_inference_steps (`int`): + The number of denoising steps to perform at inference time. Updated based on strength. + """ model_name = "qwenimage" @property @@ -646,8 +815,32 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - ## RoPE inputs for denoiser - +# auto_docstring class QwenImageRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the images latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ model_name = "qwenimage" @property @@ -715,7 +908,36 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + image_height (`int`): + The height of the reference image. Can be generated in input step. + image_width (`int`): + The width of the reference image. Can be generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. 
+ prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the images latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ model_name = "qwenimage" @property @@ -790,7 +1012,38 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus. + Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. + Should be placed after prepare_latents step. + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + image_height (`List`): + The heights of the reference images. Can be generated in input step. + image_width (`List`): + The widths of the reference images. Can be generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the image latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ model_name = "qwenimage-edit-plus" @property @@ -866,7 +1119,36 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. 
+ + Outputs: + img_shapes (`List`): + The shapes of the image latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + additional_t_cond (`Tensor`): + The additional t cond, used for RoPE calculation + """ model_name = "qwenimage-layered" @property @@ -948,7 +1230,31 @@ def __call__(self, components, state: PipelineState) -> PipelineState: ## ControlNet inputs for denoiser + +# auto_docstring class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): + """ + step that prepare inputs for controlnet. Insert before the Denoise Step, after set_timesteps step. + + Components: + controlnet (`QwenImageControlNetModel`) + + Inputs: + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + Outputs: + controlnet_keep (`List`): + The controlnet keep values + """ model_name = "qwenimage" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 4476e1db9bad..650bf34da7a3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -29,7 +29,27 @@ # after denoising loop (unpack latents) + +#auto_docstring class QwenImageAfterDenoiseStep(ModularPipelineBlocks): + """ + Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width) + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + latents (`Tensor`): + The latents to decode, can be generated in the denoise step. + + Outputs: + latents (`Tensor`): + The denoisedlatents unpacked to B, C, 1, H, W + """ model_name = "qwenimage" @property @@ -80,7 +100,28 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +#auto_docstring class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): + """ + Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising. + + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + + Outputs: + latents (`Tensor`): + Denoised latents. 
(unpacked to B, C, layers+1, H, W) + """ model_name = "qwenimage-layered" @property @@ -131,7 +172,23 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # decode step + +#auto_docstring class QwenImageDecoderStep(ModularPipelineBlocks): + """ + Step that decodes the latents to images + + Components: + vae (`AutoencoderKLQwenImage`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + + Outputs: + images (`List`): + Generated images. (tensor output of the vae decoder.) + """ model_name = "qwenimage" @property @@ -189,7 +246,25 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +#auto_docstring class QwenImageLayeredDecoderStep(ModularPipelineBlocks): + """ + Decode unpacked latents (B, C, layers+1, H, W) into layer images. + + Components: + vae (`AutoencoderKLQwenImage`) + image_processor (`VaeImageProcessor`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. + """ model_name = "qwenimage-layered" @property @@ -269,7 +344,25 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # postprocess the decoded images + +#auto_docstring class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): + """ + postprocess the generated image + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + images (`Tensor`): + the generated image tensor from decoders step + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. + """ model_name = "qwenimage" @property @@ -323,7 +416,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +#auto_docstring class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): + """ + postprocess the generated image, optional apply the mask overally to the original image.. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + images (`Tensor`): + the generated image tensor from decoders step + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + + Outputs: + images (`List`): + Generated images. + """ model_name = "qwenimage" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index ad6a9677aca3..ff6e411d7632 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -85,7 +85,7 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." 
), - InputParam.template("image_latents", note="generated in vae encoder step and updated in input step."), + InputParam.template("image_latents"), ] @torch.no_grad() @@ -197,13 +197,6 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step." - ), - InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -293,13 +286,6 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step." - ), - InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -427,19 +413,19 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam.template("image_latents", note="Can be generated from vae encoder step and updated in input step."), + InputParam.template("image_latents"), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." - ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents"), ] @torch.no_grad() @@ -521,6 +507,38 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports text2image and image2image tasks for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. 
+ """ model_name = "qwenimage" block_classes = [ @@ -546,6 +564,45 @@ def description(self) -> str: # Qwen Image (inpainting) # auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -572,6 +629,46 @@ def description(self) -> str: # Qwen Image (text2image, image2image) with controlnet # auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopBeforeDenoiserControlNet` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports text2img/img2img tasks with controlnet for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + controlnet (`QwenImageControlNetModel`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + control_image_latents (`Tensor`): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) + controlnet_keep (`List`): + The controlnet keep values. Can be generated in prepare_controlnet_inputs step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. 
+ **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -598,6 +695,53 @@ def description(self) -> str: # Qwen Image (inpainting) with controlnet # auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopBeforeDenoiserControlNet` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks with controlnet for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + controlnet (`QwenImageControlNetModel`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + control_image_latents (`Tensor`): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) + controlnet_keep (`List`): + The controlnet keep values. Can be generated in prepare_controlnet_inputs step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -632,6 +776,40 @@ def description(self) -> str: # Qwen Image Edit (image2image) # auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports QwenImage Edit. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. 
Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -656,6 +834,45 @@ def description(self) -> str: # Qwen Image Edit (inpainting) # auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks for QwenImage Edit. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -682,6 +899,40 @@ def description(self) -> str: # Qwen Image Layered (image2image) # auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports QwenImage Layered. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
+ num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-layered" block_classes = [ QwenImageEditLoopBeforeDenoiser, diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 9a83f0d7178a..083ee507ccbb 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -276,7 +276,23 @@ def encode_vae_image( # # In most of our other pipelines, resizing is done as part of the image preprocessing step. # ==================== + +# auto_docstring class QwenImageEditResizeStep(ModularPipelineBlocks): + """ + Image Resize step that resize the image to target area while maintaining the aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + resized_image (`List`): + The resized images + """ model_name = "qwenimage-edit" @@ -334,7 +350,24 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageLayeredResizeStep(ModularPipelineBlocks): + """ + Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + Outputs: + resized_image (`List`): + The resized images + """ model_name = "qwenimage-layered" @property @@ -405,7 +438,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditPlusResizeStep(ModularPipelineBlocks): + """ + Resize images for QwenImage Edit Plus pipeline. + Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text encoding. + Each image is resized independently based on its own aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + resized_image (`List`): + Images resized to 1024x1024 target area for VAE encoding + resized_cond_image (`List`): + Images resized to 384x384 target area for VL text encoding + """ model_name = "qwenimage-edit-plus" @@ -488,7 +540,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== # 2. 
GET IMAGE PROMPT # ==================== + +# auto_docstring class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): + """ + Auto-caption step that generates a text prompt from the input image if none is provided. + Uses the VL model (text_encoder) to generate a description of the image. + If prompt is already provided, this step passes through unchanged. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) + processor (`Qwen2VLProcessor`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + resized_image (`Image`): + The image to generate caption from, should be resized use the resize step + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + Outputs: + prompt (`str`): + The prompt or prompts to guide image generation. If not provided, updated using image caption + """ model_name = "qwenimage-layered" @@ -530,6 +605,16 @@ def inputs(self) -> List[InputParam]: ), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="prompt", + type_hint=str, + description="The prompt or prompts to guide image generation. If not provided, updated using image caption", + ), + ] + @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) @@ -567,7 +652,35 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # ==================== # 3. TEXT ENCODER # ==================== + +# auto_docstring class QwenImageTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step that generates text embeddings to guide the image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use + tokenizer (`Qwen2Tokenizer`): The tokenizer to use + guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ model_name = "qwenimage" def __init__(self): @@ -670,7 +783,34 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) + processor (`Qwen2VLProcessor`) + guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + resized_image (`Image`): + The image prompt to encode, should be resized using resize step + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. 
+ """ model_name = "qwenimage" def __init__(self): @@ -766,7 +906,34 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text embeddings for guiding image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) + processor (`Qwen2VLProcessor`) + guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + resized_cond_image (`Tensor`): + The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ model_name = "qwenimage-edit-plus" @@ -874,7 +1041,35 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== # 4. IMAGE PREPROCESS # ==================== + +# auto_docstring class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + mask_image (`Image`): + Mask image for inpainting. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + Outputs: + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + """ model_name = "qwenimage" @property @@ -954,7 +1149,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + mask_image (`Image`): + Mask image for inpainting. + resized_image (`Image`): + The resized image. should be generated using a resize step + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + Outputs: + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + """ model_name = "qwenimage-edit" @property @@ -1025,7 +1243,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. 
will resize the image to the given height and width. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + + Outputs: + processed_image (`Tensor`): + The processed image + """ model_name = "qwenimage" @property @@ -1087,7 +1324,22 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. Images needs to be resized first. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + resized_image (`List`): + The resized image. should be generated using a resize step + + Outputs: + processed_image (`Tensor`): + The processed image + """ model_name = "qwenimage-edit" @property @@ -1140,7 +1392,22 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + resized_image (`List`): + The resized image. should be generated using a resize step + + Outputs: + processed_image (`Tensor`): + The processed image + """ model_name = "qwenimage-edit-plus" @property @@ -1204,8 +1471,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== # 5. VAE ENCODER # ==================== + +# auto_docstring class QwenImageVaeEncoderStep(ModularPipelineBlocks): - """VAE encoder that handles both single images and lists of images with varied resolutions.""" + """ + VAE Encoder step that converts processed_image into latent representations image_latents. + Handles both single images and lists of images with varied resolutions. + + Components: + vae (`AutoencoderKLQwenImage`) + + Inputs: + processed_image (`Tensor`): + The image tensor to encode + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. + """ model_name = "qwenimage" @@ -1297,7 +1582,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): + """ + VAE Encoder step that converts `control_image` into latent representations control_image_latents. + + Components: + vae (`AutoencoderKLQwenImage`) + controlnet (`QwenImageControlNetModel`) + control_image_processor (`VaeImageProcessor`) + + Inputs: + control_image (`Image`): + Control image for ControlNet conditioning. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + control_image_latents (`Tensor`): + The latents representing the control image + """ model_name = "qwenimage" @property @@ -1411,7 +1719,20 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # ==================== # 6. 
PERMUTE LATENTS # ==================== + +# auto_docstring class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): + """ + Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing. + + Inputs: + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. (permuted from [B, C, 1, H, W] to [B, 1, C, H, W]) + """ model_name = "qwenimage-layered" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index b237031b91d2..0e03242e5e49 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -109,7 +109,42 @@ def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: in return height, width +# auto_docstring class QwenImageTextInputsStep(ModularPipelineBlocks): + """ + Text input processing step that standardizes text embeddings for the pipeline. + This step: + 1. Determines `batch_size` and `dtype` based on `prompt_embeds` + 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt) + + This block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps. + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) + """ model_name = "qwenimage" @property @@ -217,8 +252,47 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage: update height/width, expand batch, patchify.""" + """ + Input processing step that: + 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size + 2. For additional batch inputs: Expands batch dimensions to match final batch size + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be + generated in input step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + """ model_name = "qwenimage" @@ -385,8 +459,48 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage Edit Plus: handles list of latents with different sizes.""" + """ + Input processing step for Edit Plus that: + 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch + 2. For additional batch inputs: Expands batch dimensions to match final batch size + Height/width defaults to last image in the list. + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`List`): + The image heights calculated from the image latents dimension + image_width (`List`): + The image widths calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified, + concatenated, and batch-expanded) + """ model_name = "qwenimage-edit-plus" @@ -571,8 +685,44 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # same as QwenImageAdditionalInputsStep, but with layered pachifier. + +# auto_docstring class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier.""" + """ + Input processing step for Layered that: + 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size + 2. For additional batch inputs: Expands batch dimensions to match final batch size + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. 
+ + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered + pachifier and batch-expanded) + """ model_name = "qwenimage-layered" @@ -738,7 +888,32 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageControlNetInputsStep(ModularPipelineBlocks): + """ + prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps. + + Inputs: + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + + Outputs: + control_image_latents (`Tensor`): + The control image latents (patchified and batch-expanded). + height (`int`): + if not provided, updated to control image height + width (`int`): + if not provided, updated to control image width + """ model_name = "qwenimage" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 46f0b6f6ff5a..b50e41bb5079 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -65,26 +65,10 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: prompt (`str`, *optional*): The prompt or prompts to guide image generation. @@ -95,13 +79,13 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Outputs: prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. 
prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage" @@ -130,16 +114,14 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: mask_image (`Image`): Mask image for inpainting. - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -150,14 +132,14 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - TODO: Add description. - processed_mask_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage" @@ -180,14 +162,12 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -196,10 +176,10 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage" @@ -238,11 +218,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): - if `control_image` is not provided, step will be skipped. Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) Inputs: @@ -286,36 +263,50 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): Input step that prepares the inputs for the img2img denoising step. It: Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. 
+ prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) """ model_name = "qwenimage" @@ -335,38 +326,54 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): Input step that prepares the inputs for the inpainting denoising step. It: Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + processed_mask_image (`Tensor`): + The processed mask image (batch-expanded) """ model_name = "qwenimage" @@ -394,30 +401,31 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - TODO: Add description. - width (`None`): - TODO: Add description. - dtype (`None`): - TODO: Add description. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. Outputs: initial_noise (`Tensor`): The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. mask (`Tensor`): The mask to use for the inpainting process. """ @@ -445,26 +453,22 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. 
+ prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. height (`int`, *optional*): @@ -479,7 +483,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -523,34 +527,30 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -563,7 +563,7 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -609,32 +609,28 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. 
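Note (illustration only): the inpaint/img2img docstrings above reference `strength` without spelling out its effect on the schedule. Below is a generic, assumed sketch of the usual truncation behaviour; the helper name `truncate_timesteps` is made up and this is not the exact QwenImage code.

def truncate_timesteps(timesteps, num_inference_steps, strength):
    # Keep only the last `strength` fraction of the schedule: strength=1.0 starts
    # from pure noise, small strengths only lightly perturb the input image.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return timesteps[t_start:], num_inference_steps - t_start

timesteps = list(range(50, 0, -1))  # stand-in for a scheduler's timesteps
trimmed, steps_left = truncate_timesteps(timesteps, 50, strength=0.9)
assert steps_left == 45 and len(trimmed) == 45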
Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -647,7 +643,7 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -693,30 +689,25 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. - control_image_latents (`None`): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -735,12 +726,9 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -788,38 +776,33 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. - control_image_latents (`None`): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -836,12 +819,9 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
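Note (illustration only): `control_guidance_start` / `control_guidance_end` are typically converted into a per-step keep factor roughly as in the assumed sketch below; the function name and the exact progress computation are illustrative, not the library's implementation.

def controlnet_keep(num_steps, start=0.0, end=1.0):
    # 1.0 while the normalized progress lies inside [start, end], else 0.0, so the
    # ControlNet residuals are only added for that window of the schedule.
    keeps = []
    for i in range(num_steps):
        progress = i / max(num_steps - 1, 1)
        keeps.append(1.0 if start <= progress <= end else 0.0)
    return keeps

# Apply ControlNet only during the first half of a 10-step schedule.
print(controlnet_keep(10, start=0.0, end=0.5))  # five 1.0s followed by five 0.0s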
Outputs: @@ -891,36 +871,31 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - control_image_latents (`None`): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -937,12 +912,9 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -1058,20 +1030,18 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) 
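Note (illustration only): the decode steps describe `output_type` as 'pil', 'np', or 'pt'. A minimal, assumed sketch of what those options conventionally mean for a decoded VAE tensor follows; it is not the actual image-processor postprocessing code.

import torch
from PIL import Image

def postprocess(decoded: torch.Tensor, output_type: str = "pil"):
    # `decoded` is assumed to be a float tensor in [-1, 1] with shape (B, C, H, W).
    if output_type == "pt":
        return decoded
    images = (decoded / 2 + 0.5).clamp(0, 1)                    # [-1, 1] -> [0, 1]
    images = images.cpu().permute(0, 2, 3, 1).float().numpy()   # NCHW -> NHWC
    if output_type == "np":
        return images
    return [Image.fromarray((img * 255).round().astype("uint8")) for img in images]

sample = torch.rand(1, 3, 64, 64) * 2 - 1   # fake decoder output
pil_images = postprocess(sample, "pil")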
""" model_name = "qwenimage" @@ -1090,22 +1060,20 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage" @@ -1157,42 +1125,18 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: prompt (`str`, *optional*): The prompt or prompts to guide image generation. @@ -1202,8 +1146,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Maximum sequence length for prompt encoding. mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Image`, *optional*): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -1216,14 +1160,14 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Control image for ControlNet conditioning. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. 
Can be generated from text_encoder step. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -1232,29 +1176,26 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_image_latents (`None`, *optional*): - TODO: Add description. + control_image_latents (`Tensor`, *optional*): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 158763ce917a..0c1fa00842e5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -63,29 +63,14 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit VL encoder step that encode the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. 
negative_prompt (`str`, *optional*): @@ -95,13 +80,13 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage-edit" @@ -128,26 +113,23 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: resized_image (`List`): The resized images - processed_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage-edit" @@ -173,16 +155,13 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. mask_image (`Image`): Mask image for inpainting. padding_mask_crop (`int`, *optional*): @@ -193,14 +172,14 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): Outputs: resized_image (`List`): The resized images - processed_image (`None`): - TODO: Add description. - processed_mask_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage-edit" @@ -252,36 +231,50 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. 
+ negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) """ model_name = "qwenimage-edit" @@ -308,38 +301,54 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + processed_mask_image (`Tensor`): + The processed mask image (batch-expanded) """ model_name = "qwenimage-edit" @@ -368,30 +377,31 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the patchified latents `mask` based on the processed mask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - TODO: Add description. - width (`None`): - TODO: Add description. - dtype (`None`): - TODO: Add description. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. Outputs: initial_noise (`Tensor`): The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. mask (`Tensor`): The mask to use for the inpainting process. """ @@ -416,32 +426,28 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. 
Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -452,7 +458,7 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -496,34 +502,30 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -536,7 +538,7 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
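Note (illustration only): a common convention for how the inpainting `mask` is used at each step, assumed here rather than taken from the QwenImage-Edit implementation, is to re-inject the noised image latents in the unmasked region and keep the denoised latents only where the mask is 1.

import torch

def blend_inpaint_latents(latents, noised_image_latents, mask):
    # mask == 1 marks the region to repaint; mask == 0 keeps the original content.
    return (1 - mask) * noised_image_latents + mask * latents

latents = torch.randn(1, 16, 32, 32)
noised_image_latents = torch.randn(1, 16, 32, 32)
mask = torch.zeros(1, 1, 32, 32)
mask[..., 8:24, 8:24] = 1.0   # repaint only the centre square
latents = blend_inpaint_latents(latents, noised_image_latents, mask)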
Outputs: @@ -621,20 +623,18 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage-edit" @@ -653,22 +653,20 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage-edit" @@ -724,41 +722,20 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): @@ -775,10 +752,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): The height in pixels of the generated image. width (`int`): The width in pixels of the generated image. - image_latents (`None`): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -789,12 +766,12 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index a16dee1c7595..726c000f4b38 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -55,47 +55,32 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: + resized_image (`List`): + Images resized to 1024x1024 target area for VAE encoding resized_cond_image (`List`): - The resized images + Images resized to 384x384 target area for VL text encoding prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage-edit-plus" @@ -122,26 +107,25 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
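Note (illustration only): to make the 1024x1024 / 384x384 "target area" wording concrete, here is an assumed sketch of aspect-ratio-preserving target-area resizing; the rounding multiple and helper name are hypothetical, not taken from the resize processor.

import math

def target_area_size(width, height, target_area=1024 * 1024, multiple=32):
    # Scale so width*height is close to `target_area`, then snap both sides to a
    # multiple the VAE / patchifier can handle.
    scale = math.sqrt(target_area / (width * height))
    new_width = max(round(width * scale / multiple) * multiple, multiple)
    new_height = max(round(height * scale / multiple) * multiple, multiple)
    return new_width, new_height

print(target_area_size(1536, 640))                         # roughly one megapixel, e.g. (1600, 672)
print(target_area_size(1536, 640, target_area=384 * 384))  # much smaller size for the VL encoder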
Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: resized_image (`List`): - The resized images - processed_image (`None`): - TODO: Add description. + Images resized to 1024x1024 target area for VAE encoding + resized_cond_image (`List`): + Images resized to 384x384 target area for VL text encoding + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage-edit-plus" @@ -176,36 +160,50 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): - Defaults height/width from last image in the list. Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`List`): The image heights calculated from the image latents dimension image_width (`List`): The image widths calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
(patchified, + concatenated, and batch-expanded) """ model_name = "qwenimage-edit-plus" @@ -233,32 +231,28 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -269,7 +263,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -317,20 +311,18 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage-edit-plus" @@ -365,41 +357,19 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): @@ -420,7 +390,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 2471750f2e0b..37a06e9af254 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -56,73 +56,19 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - Configs: - - image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: - 1. Write the caption using natural, descriptive language without structured formats or rich text. - 2. Enrich caption details by including: - - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks - 3. 
Maintain authenticity and accuracy: - - Avoid generalizations - - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: - 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 - 2. 通过加入以下内容,丰富图注细节: - - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 - - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 - - 环境细节:例如天气、光照、颜色、纹理、气氛等 - - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 - 3. 保持真实性与准确性: - - 不要使用笼统的描述 - - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 prompt (`str`, *optional*): - The prompt to encode + The prompt or prompts to guide image generation. use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template negative_prompt (`str`, *optional*): @@ -133,14 +79,16 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Outputs: resized_image (`List`): The resized images + prompt (`str`): + The prompt or prompts to guide image generation. If not provided, updated using image caption prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage-layered" @@ -168,16 +116,13 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 generator (`Generator`, *optional*): @@ -186,10 +131,10 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Outputs: resized_image (`List`): The resized images - processed_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. 
""" model_name = "qwenimage-layered" @@ -220,36 +165,46 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImageLayeredPachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. - image_latents (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension height (`int`): - The height of the image output + if not provided, updated to image height width (`int`): - The width of the image output + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered + pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -275,28 +230,24 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Layered img2img task. Components: - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. - image_latents (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. 
+ negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. layers (`int`, *optional*, defaults to 4): @@ -309,7 +260,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -366,83 +317,24 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: - 1. Write the caption using natural, descriptive language without structured formats or rich text. - 2. Enrich caption details by including: - - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks - 3. Maintain authenticity and accuracy: - - Avoid generalizations - - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: - 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 - 2. 通过加入以下内容,丰富图注细节: - - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 - - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 - - 环境细节:例如天气、光照、颜色、纹理、气氛等 - - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 - 3. 
保持真实性与准确性: - - 不要使用笼统的描述 - - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 prompt (`str`, *optional*): - The prompt to encode + The prompt or prompts to guide image generation. use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template negative_prompt (`str`, *optional*): @@ -463,7 +355,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. From f056af1fbb24b79c6cc5360ea782abacd63c34fd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:27:40 +0100 Subject: [PATCH 18/23] make style --- .../modular_pipeline_utils.py | 18 +- .../qwenimage/before_denoise.py | 133 ++++++++----- .../modular_pipelines/qwenimage/decoders.py | 93 +++++---- .../modular_pipelines/qwenimage/denoise.py | 123 ++++++------ .../modular_pipelines/qwenimage/encoders.py | 177 ++++++++++-------- .../modular_pipelines/qwenimage/inputs.py | 91 ++++++--- .../qwenimage/modular_blocks_qwenimage.py | 136 +++++++------- .../modular_blocks_qwenimage_edit.py | 81 ++++---- .../modular_blocks_qwenimage_edit_plus.py | 37 ++-- .../modular_blocks_qwenimage_layered.py | 40 ++-- 10 files changed, 497 insertions(+), 432 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 6f1010daf219..a57212988e28 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -438,7 +438,7 @@ class ConfigSpec: "description": "Number of layers to extract from the image", }, # common intermediate inputs - "prompt_embeds":{ + "prompt_embeds": { "type_hint": torch.Tensor, "required": True, "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.", @@ -531,16 +531,16 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "InputPa raise ValueError(f"InputParam template for {template_name} not found") template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy() - + # Determine the actual param name: # 1. From overrides if provided # 2. From template if present # 3. 
Fall back to template_name name = overrides.pop("name", template_kwargs.pop("name", template_name)) - + if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" - + template_kwargs.update(overrides) return cls(name=name, **template_kwargs) @@ -564,18 +564,18 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "OutputP """Get template for name if exists, otherwise raise ValueError.""" if template_name not in OUTPUT_PARAM_TEMPLATES: raise ValueError(f"OutputParam template for {template_name} not found") - + template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy() - + # Determine the actual param name: # 1. From overrides if provided # 2. From template if present # 3. Fall back to template_name name = overrides.pop("name", template_kwargs.pop("name", template_name)) - + if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" - + template_kwargs.update(overrides) return cls(name=name, **template_kwargs) @@ -913,4 +913,4 @@ def make_doc_string( output += "\n\n" output += format_output_params(outputs, indent_level=2) - return output \ No newline at end of file + return output diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 0b8cd0f4b2d2..418d927f4faa 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -117,6 +117,7 @@ def get_timesteps(scheduler, num_inference_steps, strength): # 1. PREPARE LATENTS # ==================== + # auto_docstring class QwenImagePrepareLatentsStep(ModularPipelineBlocks): """ @@ -137,8 +138,8 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): generator (`Generator`, *optional*): Torch generator for deterministic generation. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. dtype (`dtype`, *optional*, defaults to torch.float32): The dtype of the model inputs, can be generated in input step. @@ -150,6 +151,7 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): latents (`Tensor`): The initial latents to use for the denoising process """ + model_name = "qwenimage" @property @@ -254,8 +256,8 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): generator (`Generator`, *optional*): Torch generator for deterministic generation. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. dtype (`dtype`, *optional*, defaults to torch.float32): The dtype of the model inputs, can be generated in input step. 
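Editor's illustrative sketch (not part of the patch): the hunks above add `InputParam.template()` / `OutputParam.template()` helpers that look up a named entry in the template dictionaries, append an optional `note` to the description in parentheses, apply keyword overrides, and fall back to the template name when no explicit `name` is given. The snippet below shows how a block might declare its inputs with those helpers, mirroring the usages in the later hunks of this patch. `MyExampleBlock` is hypothetical, and the import paths are assumptions based on the file locations shown in the diff.

    # Hypothetical block using the template helpers introduced above.
    from typing import List

    from diffusers.modular_pipelines.modular_pipeline import ModularPipelineBlocks
    from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

    class MyExampleBlock(ModularPipelineBlocks):
        @property
        def inputs(self) -> List[InputParam]:
            return [
                # Uses the template's name, type_hint, and description as-is.
                InputParam.template("prompt_embeds"),
                # The note is appended to the template description in parentheses.
                InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."),
                # Keyword overrides replace the template defaults (here: required, default).
                InputParam.template("height", required=True),
                InputParam.template("strength", default=0.9),
            ]

Passing an unknown template name raises a ValueError, so blocks fail fast if a template key is misspelled rather than silently producing an undocumented parameter.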
@@ -267,6 +269,7 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): latents (`Tensor`): The initial latents to use for the denoising process """ + model_name = "qwenimage-layered" @property @@ -353,7 +356,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): """ - Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should alreadybe patchified. + Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, + prepare_latents. Both noise and image latents should alreadybe patchified. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -362,8 +366,8 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -373,6 +377,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): latents (`Tensor`): The scaled noisy latents to use for inpainting/image-to-image denoising. """ + model_name = "qwenimage" @property @@ -396,10 +401,10 @@ def inputs(self) -> List[InputParam]: ), InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."), InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", ), ] @@ -475,6 +480,7 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage" @property @@ -541,10 +547,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 2. SET TIMESTEPS # ==================== + # auto_docstring class QwenImageSetTimestepsStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step. + Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents + step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -561,6 +569,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): timesteps (`Tensor`): The timesteps to use for the denoising process """ + model_name = "qwenimage" @property @@ -579,10 +588,10 @@ def inputs(self) -> List[InputParam]: InputParam.template("num_inference_steps"), InputParam.template("sigmas"), InputParam( - name="latents", + name="latents", required=True, type_hint=torch.Tensor, - description="The initial random noised latents for the denoising process. Can be generated in prepare latents step." + description="The initial random noised latents for the denoising process. 
Can be generated in prepare latents step.", ), ] @@ -640,13 +649,14 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): sigmas (`List`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and packed in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and packed in input step.) Outputs: timesteps (`Tensor`): The timesteps to use for the denoising process. """ + model_name = "qwenimage-layered" @property @@ -671,9 +681,7 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="timesteps", - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process." + name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process." ), ] @@ -711,7 +719,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # auto_docstring class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare latents step. + Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after + prepare latents step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -732,6 +741,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): num_inference_steps (`int`): The number of denoising steps to perform at inference time. Updated based on strength. """ + model_name = "qwenimage" @property @@ -750,10 +760,10 @@ def inputs(self) -> List[InputParam]: InputParam.template("num_inference_steps"), InputParam.template("sigmas"), InputParam( - "latents", - required=True, + "latents", + required=True, type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare latents step." + description="The latents to use for the denoising process. Can be generated in prepare latents step.", ), InputParam.template("strength", default=0.9), ] @@ -815,6 +825,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - ## RoPE inputs for denoiser + # auto_docstring class QwenImageRoPEInputsStep(ModularPipelineBlocks): """ @@ -822,8 +833,8 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`): The height in pixels of the generated image. width (`int`): @@ -841,6 +852,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage" @property @@ -911,12 +923,13 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): """ - Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. 
Should be placed after prepare_latents step + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after + prepare_latents step Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_height (`int`): The height of the reference image. Can be generated in input step. image_width (`int`): @@ -938,6 +951,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage" @property @@ -948,8 +962,18 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), - InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. Can be generated in input step."), + InputParam( + name="image_height", + required=True, + type_hint=int, + description="The height of the reference image. Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=int, + description="The width of the reference image. Can be generated in input step.", + ), InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), @@ -1016,13 +1040,13 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): """ Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus. - Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. - Should be placed after prepare_latents step. + Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed + after prepare_latents step. Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_height (`List`): The heights of the reference images. Can be generated in input step. image_width (`List`): @@ -1044,6 +1068,7 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage-edit-plus" @property @@ -1058,8 +1083,18 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=List[int], description="The heights of the reference images. Can be generated in input step."), - InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. Can be generated in input step."), + InputParam( + name="image_height", + required=True, + type_hint=List[int], + description="The heights of the reference images. 
Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=List[int], + description="The widths of the reference images. Can be generated in input step.", + ), InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), @@ -1126,8 +1161,8 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image height (`int`): @@ -1149,6 +1184,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): additional_t_cond (`Tensor`): The additional t cond, used for RoPE calculation """ + model_name = "qwenimage-layered" @property @@ -1231,6 +1267,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: ## ControlNet inputs for denoiser + # auto_docstring class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): """ @@ -1247,7 +1284,8 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -1255,6 +1293,7 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): controlnet_keep (`List`): The controlnet keep values """ + model_name = "qwenimage" @property @@ -1274,16 +1313,16 @@ def inputs(self) -> List[InputParam]: InputParam.template("control_guidance_end"), InputParam.template("controlnet_conditioning_scale"), InputParam( - name="control_image_latents", - required=True, - type_hint=torch.Tensor, - description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step." + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.", ), InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. 
Can be generated in set_timesteps step.", ), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 650bf34da7a3..1adbf6bdd355 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -30,10 +30,12 @@ # after denoising loop (unpack latents) -#auto_docstring + +# auto_docstring class QwenImageAfterDenoiseStep(ModularPipelineBlocks): """ - Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width) + Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, + channels, 1, height, width) Components: pachifier (`QwenImagePachifier`) @@ -50,6 +52,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks): latents (`Tensor`): The denoisedlatents unpacked to B, C, 1, H, W """ + model_name = "qwenimage" @property @@ -70,10 +73,10 @@ def inputs(self) -> List[InputParam]: InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step.", ), ] @@ -81,9 +84,7 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The denoisedlatents unpacked to B, C, 1, H, W" + name="latents", type_hint=torch.Tensor, description="The denoisedlatents unpacked to B, C, 1, H, W" ), ] @@ -100,7 +101,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -#auto_docstring +# auto_docstring class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): """ Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising. @@ -122,6 +123,7 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): latents (`Tensor`): Denoised latents. (unpacked to B, C, layers+1, H, W) """ + model_name = "qwenimage-layered" @property @@ -138,10 +140,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step.", ), InputParam.template("height", required=True), InputParam.template("width", required=True), @@ -173,7 +175,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # decode step -#auto_docstring + +# auto_docstring class QwenImageDecoderStep(ModularPipelineBlocks): """ Step that decodes the latents to images @@ -183,12 +186,14 @@ class QwenImageDecoderStep(ModularPipelineBlocks): Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. Outputs: images (`List`): Generated images. (tensor output of the vae decoder.) 
""" + model_name = "qwenimage" @property @@ -207,10 +212,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", ), ] @@ -246,18 +251,18 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -#auto_docstring +# auto_docstring class QwenImageLayeredDecoderStep(ModularPipelineBlocks): """ Decode unpacked latents (B, C, layers+1, H, W) into layer images. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -265,6 +270,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-layered" @property @@ -287,10 +293,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", ), InputParam.template("output_type"), ] @@ -345,7 +351,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # postprocess the decoded images -#auto_docstring + +# auto_docstring class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): """ postprocess the generated image @@ -363,6 +370,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage" @property @@ -384,10 +392,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="images", - required=True, - type_hint=torch.Tensor, - description="the generated image tensor from decoders step" + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", ), InputParam.template("output_type"), ] @@ -416,7 +424,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state -#auto_docstring +# auto_docstring class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): """ postprocess the generated image, optional apply the mask overally to the original image.. @@ -430,12 +438,14 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. 
+ The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): Generated images. """ + model_name = "qwenimage" @property @@ -457,16 +467,17 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="images", - required=True, - type_hint=torch.Tensor, - description="the generated image tensor from decoders step" + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", ), InputParam.template("output_type"), InputParam( - name="mask_overlay_kwargs", + name="mask_overlay_kwargs", type_hint=Dict[str, Any], - description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), + description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.", + ), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index ff6e411d7632..3b00fcb274df 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -50,10 +50,10 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", ), ] @@ -80,10 +80,10 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", ), InputParam.template("image_latents"), ] @@ -131,10 +131,10 @@ def inputs(self) -> List[InputParam]: ), InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."), InputParam( - name="controlnet_keep", - required=True, - type_hint=List[float], - description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step." + name="controlnet_keep", + required=True, + type_hint=List[float], + description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.", ), ] @@ -467,10 +467,10 @@ def loop_expected_components(self) -> List[ComponentSpec]: def loop_inputs(self) -> List[InputParam]: return [ InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", ), InputParam.template("num_inference_steps", required=True), ] @@ -505,21 +505,21 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # Qwen Image (text2image, image2image) + # auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. 
- Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports text2image and image2image tasks for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -539,6 +539,7 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ @@ -551,8 +552,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): @property def description(self) -> str: return ( - "Denoise step that iteratively denoise the latents. \n" - "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "Denoise step that iteratively denoise the latents.\n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n" "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" " - `QwenImageLoopBeforeDenoiser`\n" " - `QwenImageLoopDenoiser`\n" @@ -565,9 +566,9 @@ def description(self) -> str: # auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopDenoiser` - `QwenImageLoopAfterDenoiser` @@ -575,9 +576,8 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -603,6 +603,7 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -630,9 +631,9 @@ def description(self) -> str: # auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. 
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopBeforeDenoiserControlNet` - `QwenImageLoopDenoiser` @@ -640,10 +641,8 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports text2img/img2img tasks with controlnet for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - controlnet (`QwenImageControlNetModel`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -669,6 +668,7 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -696,9 +696,9 @@ def description(self) -> str: # auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopBeforeDenoiserControlNet` - `QwenImageLoopDenoiser` @@ -707,10 +707,8 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks with controlnet for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - controlnet (`QwenImageControlNetModel`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -742,6 +740,7 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -777,18 +776,17 @@ def description(self) -> str: # auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports QwenImage Edit. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -810,6 +808,7 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. 
""" + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -835,9 +834,9 @@ def description(self) -> str: # auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` @@ -845,9 +844,8 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks for QwenImage Edit. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -873,6 +871,7 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -900,18 +899,17 @@ def description(self) -> str: # auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports QwenImage Layered. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -933,6 +931,7 @@ class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageEditLoopBeforeDenoiser, diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 083ee507ccbb..5e1821cca5c0 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -30,7 +30,7 @@ from ...utils import logging from ...utils.torch_utils import unwrap_module from ..modular_pipeline import ModularPipelineBlocks, PipelineState -from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam from .modular_pipeline import QwenImageModularPipeline from .prompt_templates import ( QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE, @@ -277,6 +277,7 @@ def encode_vae_image( # In most of our other pipelines, resizing is done as part of the image preprocessing step. 
# ==================== + # auto_docstring class QwenImageEditResizeStep(ModularPipelineBlocks): """ @@ -293,8 +294,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): resized_image (`List`): The resized images """ - model_name = "qwenimage-edit" + model_name = "qwenimage-edit" @property def description(self) -> str: @@ -319,8 +320,8 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="resized_image", - type_hint=List[PIL.Image.Image], + name="resized_image", + type_hint=List[PIL.Image.Image], description="The resized images", ), ] @@ -353,7 +354,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageLayeredResizeStep(ModularPipelineBlocks): """ - Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio. + Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while + maintaining the aspect ratio. Components: image_resize_processor (`VaeImageProcessor`) @@ -368,11 +370,12 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): resized_image (`List`): The resized images """ + model_name = "qwenimage-layered" @property def description(self) -> str: - return f"Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." + return "Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." @property def expected_components(self) -> List[ComponentSpec]: @@ -399,11 +402,13 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="resized_image", - type_hint=List[PIL.Image.Image], - description="The resized images", - )] + return [ + OutputParam( + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", + ) + ] @staticmethod def check_inputs(resolution: int): @@ -442,8 +447,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditPlusResizeStep(ModularPipelineBlocks): """ Resize images for QwenImage Edit Plus pipeline. - Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text encoding. - Each image is resized independently based on its own aspect ratio. + Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text + encoding. Each image is resized independently based on its own aspect ratio. 
Components: image_resize_processor (`VaeImageProcessor`) @@ -484,7 +489,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: # image - return [InputParam.template("image")] + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -518,13 +523,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): resized_cond_images = [] for image in images: image_width, image_height = image.size - + # For VAE encoder (1024x1024 target area) vae_width, vae_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) - resized_images.append( - components.image_resize_processor.resize(image, height=vae_height, width=vae_width) - ) - + resized_images.append(components.image_resize_processor.resize(image, height=vae_height, width=vae_width)) + # For VL text encoder (384x384 target area) vl_width, vl_height, _ = calculate_dimensions(384 * 384, image_width / image_height) resized_cond_images.append( @@ -541,16 +544,16 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 2. GET IMAGE PROMPT # ==================== + # auto_docstring class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): """ Auto-caption step that generates a text prompt from the input image if none is provided. - Uses the VL model (text_encoder) to generate a description of the image. - If prompt is already provided, this step passes through unchanged. + Uses the VL model (text_encoder) to generate a description of the image. If prompt is already provided, this step + passes through unchanged. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) Inputs: prompt (`str`, *optional*): @@ -590,7 +593,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("prompt", required=False), # it is not required for qwenimage-layered, unlike other pipelines + InputParam.template( + "prompt", required=False + ), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -653,15 +658,15 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 3. TEXT ENCODER # ==================== + # auto_docstring class QwenImageTextEncoderStep(ModularPipelineBlocks): """ Text Encoder step that generates text embeddings to guide the image generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -681,6 +686,7 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. 
""" + model_name = "qwenimage" def __init__(self): @@ -706,7 +712,6 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -786,12 +791,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditTextEncoderStep(ModularPipelineBlocks): """ - Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation. + Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image + generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -811,6 +816,7 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. """ + model_name = "qwenimage" def __init__(self): @@ -835,7 +841,6 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -909,12 +914,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): """ - Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text embeddings for guiding image generation. + Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text + embeddings for guiding image generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -922,7 +927,8 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. resized_cond_image (`Tensor`): - The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step + The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using + resize step Outputs: prompt_embeds (`Tensor`): @@ -963,7 +969,6 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -1042,10 +1047,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 4. IMAGE PREPROCESS # ==================== + # auto_docstring class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width. + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be + resized to the given height and width. 
Components: image_mask_processor (`InpaintProcessor`) @@ -1070,6 +1077,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay """ + model_name = "qwenimage" @property @@ -1152,7 +1160,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first. + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be + resized first. Components: image_mask_processor (`InpaintProcessor`) @@ -1173,6 +1182,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay """ + model_name = "qwenimage-edit" @property @@ -1206,11 +1216,7 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image" - ), + OutputParam(name="processed_image", type_hint=torch.Tensor, description="The processed image"), OutputParam( name="processed_mask_image", type_hint=torch.Tensor, @@ -1263,6 +1269,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage" @property @@ -1290,11 +1297,13 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @staticmethod def check_inputs(height, width, vae_scale_factor): @@ -1340,6 +1349,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage-edit" @property @@ -1361,7 +1371,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="resized_image", + name="resized_image", required=True, type_hint=List[PIL.Image.Image], description="The resized image. should be generated using a resize step", @@ -1370,11 +1380,13 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1395,7 +1407,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images. + Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of + processed images. 
Components: image_processor (`VaeImageProcessor`) @@ -1408,6 +1421,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage-edit-plus" @property @@ -1427,20 +1441,24 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam( - name="resized_image", - required=True, - type_hint=List[PIL.Image.Image], - description="The resized image. should be generated using a resize step", - )] + return [ + InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + ) + ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1472,6 +1490,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 5. VAE ENCODER # ==================== + # auto_docstring class QwenImageVaeEncoderStep(ModularPipelineBlocks): """ @@ -1509,7 +1528,9 @@ def __init__( output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents". """ if input is None: - input = InputParam(name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode") + input = InputParam( + name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode" + ) if output is None: output = OutputParam.template("image_latents") @@ -1539,13 +1560,13 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - self._input, # default is "processed_image" + self._input, # default is "processed_image" InputParam.template("generator"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [self._output] # default is "image_latents" + return [self._output] # default is "image_latents" @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -1588,9 +1609,8 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): VAE Encoder step that converts `control_image` into latent representations control_image_latents. Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor + (`VaeImageProcessor`) Inputs: control_image (`Image`): @@ -1606,6 +1626,7 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): control_image_latents (`Tensor`): The latents representing the control image """ + model_name = "qwenimage" @property @@ -1720,6 +1741,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 6. PERMUTE LATENTS # ==================== + # auto_docstring class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): """ @@ -1733,11 +1755,12 @@ class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): image_latents (`Tensor`): The latent representation of the input image. 
(permuted from [B, C, 1, H, W] to [B, 1, C, H, W]) """ + model_name = "qwenimage-layered" @property def description(self) -> str: - return f"Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." + return "Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." @property def inputs(self) -> List[InputParam]: @@ -1760,4 +1783,4 @@ def __call__(self, components, state: PipelineState) -> PipelineState: block_state.image_latents = latents.permute(0, 2, 1, 3, 4) self.set_block_state(state, block_state) - return components, state \ No newline at end of file + return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 0e03242e5e49..818bbca5ed0a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple import torch @@ -117,7 +117,8 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): 1. Determines `batch_size` and `dtype` based on `prompt_embeds` 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt) - This block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps. + This block should be placed after all encoder steps to process the text embeddings before they are used in + subsequent pipeline steps. Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -145,6 +146,7 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. (batch-expanded) """ + model_name = "qwenimage" @property @@ -271,8 +273,8 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -300,7 +302,7 @@ def __init__( self, image_latent_inputs: Optional[List[InputParam]] = None, additional_batch_inputs: Optional[List[InputParam]] = None, - ): + ): # by default, process `image_latents` if image_latent_inputs is None: image_latent_inputs = [InputParam.template("image_latents")] @@ -319,7 +321,9 @@ def __init__( else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -376,13 +380,17 @@ def intermediate_outputs(self) -> List[OutputParam]: name="image_width", type_hint=int, description="The image width calculated from the image latents dimension", - ) + ), ] # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # image latent inputs are modified in place (patchified and batch-expanded) for input_param in self._image_latent_inputs: @@ -479,8 +487,8 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -526,7 +534,9 @@ def __init__( else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -587,11 +597,15 @@ def intermediate_outputs(self) -> List[OutputParam]: description="The image widths calculated from the image latents dimension", ), ] - + # `height`/`width` are updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) for input_param in self._image_latent_inputs: @@ -686,11 +700,13 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # same as QwenImageAdditionalInputsStep, but with layered pachifier. + # auto_docstring class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): """ Input processing step for Layered that: - 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size + 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch + size 2. For additional batch inputs: Expands batch dimensions to match final batch size Configured inputs: @@ -705,8 +721,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. @@ -720,8 +736,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): width (`int`): if not provided, updated to image width image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered - pachifier and batch-expanded) + image latents used to guide the image generation. Can be generated from vae_encoder step. 
(patchified + with layered pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -748,7 +764,9 @@ def __init__( else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -808,8 +826,12 @@ def intermediate_outputs(self) -> List[OutputParam]: ] if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) for input_param in self._image_latent_inputs: @@ -895,10 +917,11 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): Inputs: control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. height (`int`, *optional*): @@ -914,6 +937,7 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): width (`int`): if not provided, updated to control image width """ + model_name = "qwenimage" @property @@ -923,17 +947,26 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."), + InputParam( + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step.", + ), InputParam.template("batch_size"), InputParam.template("num_images_per_prompt"), InputParam.template("height"), InputParam.template("width"), ] - + @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The control image latents (patchified and batch-expanded)."), + OutputParam( + name="control_image_latents", + type_hint=torch.Tensor, + description="The control image latents (patchified and batch-expanded).", + ), OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"), OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index b50e41bb5079..5837799d3431 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -13,9 +13,10 @@ # limitations under the License. import torch + from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam from .before_denoise import ( QwenImageControlNetBeforeDenoiserStep, QwenImageCreateMaskLatentsStep, @@ -65,9 +66,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: prompt (`str`, *optional*): @@ -114,8 +114,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) + image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`) Inputs: mask_image (`Image`): @@ -162,8 +161,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -218,9 +216,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): - if `control_image` is not provided, step will be skipped. 
Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor + (`VaeImageProcessor`) Inputs: control_image (`Image`, *optional*): @@ -380,7 +377,9 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): block_classes = [ QwenImageTextInputsStep(), QwenImageAdditionalInputsStep( - additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] + additional_batch_inputs=[ + InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image") + ] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -401,15 +400,14 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -450,13 +448,12 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -524,13 +521,12 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -606,13 +602,12 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) 
for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -686,14 +681,12 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -707,7 +700,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): negative_prompt_embeds_mask (`Tensor`, *optional*): mask for the negative text embeddings. Can be generated from text_encoder step. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -773,14 +767,12 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -802,7 +794,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): processed_mask_image (`Tensor`, *optional*): The processed mask image control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. 
generator (`Generator`, *optional*): @@ -868,14 +861,12 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -895,7 +886,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -1030,12 +1022,12 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -1057,19 +1049,21 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) + vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. 
Outputs: images (`List`): @@ -1125,17 +1119,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) controlnet (`QwenImageControlNetModel`) + control_image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: prompt (`str`, *optional*): @@ -1185,7 +1173,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. control_image_latents (`Tensor`, *optional*): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): @@ -1195,7 +1184,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 0c1fa00842e5..e1e5c4335481 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -13,11 +13,12 @@ # limitations under the License. from typing import Optional + import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam from .before_denoise import ( QwenImageCreateMaskLatentsStep, QwenImageEditRoPEInputsStep, @@ -63,10 +64,8 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit VL encoder step that encode the image and text prompts together. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -113,9 +112,8 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -155,9 +153,8 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -354,7 +351,10 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep(additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] + QwenImageAdditionalInputsStep( + additional_batch_inputs=[ + InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image") + ] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -377,15 +377,14 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the patchified latents `mask` based on the processed mask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -426,10 +425,8 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -502,10 +499,8 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit inpaint task. 
Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -623,12 +618,12 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -650,19 +645,21 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) + vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): @@ -719,19 +716,14 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: image (`Union[Image, List]`): @@ -771,7 +763,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 726c000f4b38..37656cef5d76 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( QwenImageEditPlusRoPEInputsStep, QwenImagePrepareLatentsStep, @@ -55,10 +54,8 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -107,9 +104,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -231,10 +227,8 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -311,12 +305,12 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -357,14 +351,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 37a06e9af254..fdfeab048835 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam @@ -53,14 +52,12 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -116,9 +113,8 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -203,8 +199,8 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): width (`int`): if not provided, updated to image width image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered - pachifier and batch-expanded) + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified + with layered pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -230,10 +226,8 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Layered img2img task. Components: - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -317,16 +311,10 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) + image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: image (`Union[Image, List]`): From 94525200fdbc55f1f2ed1c6ef64cba8cd990da21 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:35:39 +0100 Subject: [PATCH 19/23] rmove space in make docstring --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index a57212988e28..5468cf54d0fc 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -893,7 +893,7 @@ def make_doc_string( # Add description if description: desc_lines = description.strip().split("\n") - aligned_desc = "\n".join(" " + line for line in desc_lines) + aligned_desc = "\n".join(" " + line.rstrip() for line in desc_lines) output += aligned_desc + "\n\n" # Add components section if provided From 7e9d2b954e734d382a138d69743025eab9f7aeba Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sun, 18 Jan 2026 22:44:44 -1000 Subject: [PATCH 20/23] Apply suggestions from code review --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 2 +- src/diffusers/modular_pipelines/qwenimage/before_denoise.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 5468cf54d0fc..8116f26d39a3 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -549,7 +549,7 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "InputPa class OutputParam: """Specification for an output parameter.""" - name: str = None + name: str type_hint: Any = None description: str = "" kwargs_type: str = None diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 418d927f4faa..aae6eb50d935 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -674,7 +674,7 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("num_inference_steps"), InputParam.template("sigmas"), - InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), + InputParam.template("image_latents"), ] @property From b7127ce7a72ddffadaf70c334effb24cf0422649 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:54:40 +0100 Subject: [PATCH 21/23] revert change in z --- 
src/diffusers/modular_pipelines/z_image/denoise.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py index a165fb513f3c..5f76a8459fde 100644 --- a/src/diffusers/modular_pipelines/z_image/denoise.py +++ b/src/diffusers/modular_pipelines/z_image/denoise.py @@ -129,7 +129,10 @@ def inputs(self) -> List[Tuple[str, Any]]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam.denoiser_input_fields(), + InputParam( + kwargs_type="denoiser_input_fields", + description="The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + ), ] guider_input_names = [] uncond_guider_input_names = [] From 1f9576a2ca97c6bacef9f79b570c7b859b663b13 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:56:14 +0100 Subject: [PATCH 22/23] fix --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 2 +- src/diffusers/modular_pipelines/qwenimage/before_denoise.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 8116f26d39a3..f3b12d716160 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -549,7 +549,7 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "InputPa class OutputParam: """Specification for an output parameter.""" - name: str + name: str type_hint: Any = None description: str = "" kwargs_type: str = None diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index aae6eb50d935..3c9d29260d12 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -649,8 +649,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): sigmas (`List`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be - generated from vae encoder and packed in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. 
Outputs: timesteps (`Tensor`): From 23d06423abf84f70414d2c42908fdd03485a7cf3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 19 Jan 2026 09:23:31 +0000 Subject: [PATCH 23/23] Apply style fixes --- .../pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py | 1 - src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py | 1 - .../pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py | 1 - .../stable_diffusion/pipeline_stable_diffusion_latent_upscale.py | 1 - 4 files changed, 4 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index 94c4c394465b..2ea7307fec32 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -84,7 +84,6 @@ >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL >>> from diffusers.utils import load_image - >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") >>> feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas") >>> controlnet = ControlNetModel.from_pretrained( diff --git a/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py b/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py index d259f7ee7865..b41d9772a7cc 100644 --- a/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py +++ b/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py @@ -53,7 +53,6 @@ >>> from transformers import AutoTokenizer, LlamaForCausalLM >>> from diffusers import HiDreamImagePipeline - >>> tokenizer_4 = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct") >>> text_encoder_4 = LlamaForCausalLM.from_pretrained( ... "meta-llama/Meta-Llama-3.1-8B-Instruct", diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py index df5b3f5c10a5..5a6b8d5e9f37 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py @@ -85,7 +85,6 @@ >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetPAGImg2ImgPipeline, AutoencoderKL >>> from diffusers.utils import load_image - >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas") >>> controlnet = ControlNetModel.from_pretrained( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 66d5ffa6b849..a1d0407caf5e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -459,7 +459,6 @@ def __call__( >>> from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline >>> import torch - >>> pipeline = StableDiffusionPipeline.from_pretrained( ... "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 ... )