Conversation

@yiyixuxu yiyixuxu commented Jan 20, 2026

testing klein

from diffusers import ModularPipeline
import torch
import gc

device = "cuda"
dtype = torch.bfloat16

# 4b-base
repo_id = "black-forest-labs/FLUX.2-klein-base-4B"

pipeline = ModularPipeline.from_pretrained(repo_id)


pipeline.load_components(torch_dtype=dtype)
pipeline.to(device)
prompt = "a photo of a forest with mist swirling around the tree trunks. The word 'FLUX.2' is painted over it in big, red brush strokes with visible texture"
height = 768
width = 1360
num_inference_steps = 50
generator = torch.Generator(device=device).manual_seed(42)

out = pipeline(
    prompt=prompt,
    height=height,
    width=width,
    num_inference_steps=num_inference_steps,
    generator=generator
).images[0]

out.save("output_4b_base.png")

del pipeline
gc.collect()
torch.cuda.empty_cache()

# 9b-base
repo_id = "black-forest-labs/FLUX.2-klein-base-9B"

pipeline = ModularPipeline.from_pretrained(repo_id)
pipeline.load_components(torch_dtype=dtype)
pipeline.to(device)
prompt = "a photo of a forest with mist swirling around the tree trunks. The word 'FLUX.2' is painted over it in big, red brush strokes with visible texture"
height = 768
width = 1360
num_inference_steps = 50
generator = torch.Generator(device=device).manual_seed(42)

out = pipeline(
    prompt=prompt,
    height=height,
    width=width,
    num_inference_steps=num_inference_steps,
    generator=generator
).images[0]

out.save("output_9b_base.png")


# 4b

repo_id = "black-forest-labs/FLUX.2-klein-4B"
pipeline = ModularPipeline.from_pretrained(repo_id)
pipeline.load_components(torch_dtype=dtype)
pipeline.to(device)

num_inference_steps = 4
generator = torch.Generator(device=device).manual_seed(42)

out = pipeline(
    prompt=prompt,
    height=height,
    width=width,
    num_inference_steps=num_inference_steps,
    generator=generator
).images[0]

out.save("output_4b.png")

del pipeline
gc.collect()
torch.cuda.empty_cache()

# 9b
repo_id = "black-forest-labs/FLUX.2-klein-9B"
pipeline = ModularPipeline.from_pretrained(repo_id)
pipeline.load_components(torch_dtype=dtype)
pipeline.to(device)

num_inference_steps = 4
generator = torch.Generator(device=device).manual_seed(42)

out = pipeline(
    prompt=prompt,
    height=height,
    width=width,
    num_inference_steps=num_inference_steps,
    generator=generator
).images[0]

out.save("output_9b.png")

del pipeline
gc.collect()
torch.cuda.empty_cache()

testing klein with modular setting

from diffusers import ModularPipeline
import torch
import gc

device = "cuda"
dtype = torch.bfloat16

prompt = "a photo of a forest with mist swirling around the tree trunks. The word 'FLUX.2' is painted over it in big, red brush strokes with visible texture"
height = 768
width = 1360
# 4b

repo_ids = ["black-forest-labs/FLUX.2-klein-4B", "black-forest-labs/FLUX.2-klein-9B", "black-forest-labs/FLUX.2-klein-base-4B", "black-forest-labs/FLUX.2-klein-base-9B"]
for repo_id in repo_ids:
    print(f" =========================================")
    print(f" {repo_id}")
    blocks = ModularPipeline.from_pretrained(repo_id).blocks

    text_encoder_node = blocks.sub_blocks["text_encoder"].init_pipeline(repo_id)
    vae_encoder_node = blocks.sub_blocks["vae_encoder"].init_pipeline(repo_id)
    decoder_node = blocks.sub_blocks["decode"].init_pipeline(repo_id)
    denoise_node = blocks.sub_blocks["denoise"].init_pipeline(repo_id)


    text_encoder_node.load_components(torch_dtype=dtype)
    text_encoder_node.to(device)
    print(" ")
    print(f" text_encoder block: {text_encoder_node.blocks.doc}")
    text_embeddings = text_encoder_node(prompt=prompt).get_by_kwargs("denoiser_input_fields")

    denoise_node.load_components(torch_dtype=dtype)
    denoise_node.to(device)
    print(" ")
    print(f" denoise block: {denoise_node.blocks.doc}")
    if "base" in repo_id:
        num_inference_steps = 50
    else:
        num_inference_steps = 4

    latents = denoise_node(
        **text_embeddings, 
        height=height, 
        width=width, 
        num_inference_steps=num_inference_steps, 
        generator=torch.Generator(device=device).manual_seed(42)
        ).latents

    decoder_node.load_components(torch_dtype=dtype)
    decoder_node.to(device)
    print(" ")
    print(f" decoder block: {decoder_node.blocks.doc}")
    print(" ")
    image = decoder_node(latents=latents).images[0]
    image.save(f"output_{repo_id.split("/")[-1]}.png")

testing flux2-dev

from diffusers import ModularPipeline, ComponentsManager
from transformers import PixtralProcessor
import torch

device = "cuda"
dtype = torch.bfloat16
repo_id = "black-forest-labs/FLUX.2-dev"

components = ComponentsManager()
# there is an issue in transformers where the `subfolder` arg is ignored, so load the tokenizer from a local path instead
cached_tokenizer_path = "/local/path/to/tokenizer/folder"
tokenizer = PixtralProcessor.from_pretrained(cached_tokenizer_path)

pipeline = ModularPipeline.from_pretrained(repo_id, components_manager=components)
pipeline.update_components(tokenizer=tokenizer)
pipeline.load_components(torch_dtype=dtype)

components.enable_auto_cpu_offload(device=device)

prompt = "a photo of a forest with mist swirling around the tree trunks. The word 'FLUX.2' is painted over it in big, red brush strokes with visible texture"
height = 768
width = 1360
num_inference_steps = 50
generator = torch.Generator(device=device).manual_seed(42)

print(f" pipeline: {pipeline.blocks.doc}")

out = pipeline(
    prompt=prompt,
    height=height,
    width=width,
    num_inference_steps=num_inference_steps,
    generator=generator
).images[0]

out.save("output_dev.png")

modular setting

from diffusers import ModularPipeline, ComponentsManager
from transformers import PixtralProcessor
import torch

device = "cuda"
dtype = torch.bfloat16

components = ComponentsManager()
# dev
repo_id = "black-forest-labs/FLUX.2-dev"

blocks = ModularPipeline.from_pretrained(repo_id).blocks
text_node = blocks.sub_blocks["text_encoder"].init_pipeline(repo_id, components_manager=components)
denoise_node = blocks.sub_blocks["denoise"].init_pipeline(repo_id, components_manager=components)
decoder_node = blocks.sub_blocks["decode"].init_pipeline(repo_id, components_manager=components)


cached_tokenizer_path = "/path/to/local/folder/tokenizer"
tokenizer = PixtralProcessor.from_pretrained(cached_tokenizer_path)
print(f" tokenizer: {tokenizer}")

text_node.update_components(tokenizer=tokenizer)
text_node.load_components(torch_dtype=dtype)
denoise_node.load_components(torch_dtype=dtype)
decoder_node.load_components(torch_dtype=dtype)

components.enable_auto_cpu_offload(device=device)

prompt = "a photo of a forest with mist swirling around the tree trunks. The word 'FLUX.2' is painted over it in big, red brush strokes with visible texture"
height = 768
width = 1360
num_inference_steps = 50
generator = torch.Generator(device=device).manual_seed(42)

print(f" text_node: {text_node.blocks.doc}")
text_embeddings = text_node(prompt=prompt).get_by_kwargs("denoiser_input_fields")

print(f" denoise_node: {denoise_node.blocks.doc}")
latents = denoise_node(**text_embeddings, height=height, width=width, num_inference_steps=num_inference_steps, generator=generator).latents
# print(f" latents: {latents}")

print(f" decoder_node: {decoder_node.blocks.doc}")
image = decoder_node(latents=latents).images[0]
print(f" image: {image}")
image.save("output_dev.png")

def inputs(self) -> List[InputParam]:
    return [
        InputParam(name="prompt_embeds", required=True),
        InputParam(name="latent_ids"),
Collaborator Author:

removed because `latent_ids` is not used in this block, I think

def inputs(self) -> List[InputParam]:
    return [
        InputParam("prompt"),
        InputParam("prompt_embeds", type_hint=torch.Tensor, required=False),
Collaborator Author:

in modular, we can just pop out the text_encoder block and use it in a standalone manner if we want to compute the embeddings separately. so I'm trying to remove these arguments everywhere to simplify the code a bit (same with image_latents too; it gets really complicated for some inpainting pipelines)

blocks = ...
text_node = blocks.pop("text_encoder").init_pipeline(repo_id)

pipe = blocks.init_pipeline(repo_id)
prompt_embeds = text_node(prompt=...)
out = pipe(prompt_embeds=...)
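
To make the sketch above concrete, here is a fuller (untested) version, assuming the Klein 4B repo and the same load_components / get_by_kwargs("denoiser_input_fields") pattern used in the test scripts earlier in this thread; the repo id, resolution, and step count are simply reused from those scripts.

# untested sketch expanding the pop-out pattern above; values reused from the test scripts
from diffusers import ModularPipeline
import torch

device = "cuda"
dtype = torch.bfloat16
repo_id = "black-forest-labs/FLUX.2-klein-4B"

blocks = ModularPipeline.from_pretrained(repo_id).blocks

# pop the text_encoder block and run it as a standalone pipeline
text_node = blocks.pop("text_encoder").init_pipeline(repo_id)
text_node.load_components(torch_dtype=dtype)
text_node.to(device)

# the remaining blocks form a pipeline that takes precomputed embeddings
pipe = blocks.init_pipeline(repo_id)
pipe.load_components(torch_dtype=dtype)
pipe.to(device)

prompt = "a photo of a forest with mist swirling around the tree trunks"
text_embeddings = text_node(prompt=prompt).get_by_kwargs("denoiser_input_fields")

out = pipe(
    **text_embeddings,
    height=768,
    width=1360,
    num_inference_steps=4,
    generator=torch.Generator(device=device).manual_seed(42),
).images[0]
out.save("output_klein_4b_standalone_text_encoder.png")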

@HuggingFaceDocBuilderDev

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.

yiyi@huggingface.co added 2 commits January 20, 2026 01:31
@yiyixuxu yiyixuxu requested a review from DN6 January 20, 2026 01:34
Member @sayakpaul left a comment:

Not a requirement but should we add a small test for it as well? 👀

    return [
        InputParam(name="prompt_embeds", required=True),
        InputParam(name="latent_ids"),
        InputParam(name="negative_prompt_embeds", required=False),
Member:

No strong opinions, but WDYT of creating a separate block for Klein altogether? I think that way it will be a bit easier to debug and will also separate concerns.

My suggestion mainly comes from the fact that Flux.2-Dev doesn't use negative_prompt_embeds while Flux.2-Klein does. So maybe that warrants creating separate blocks.

Collaborator Author @yiyixuxu, Jan 20, 2026:

It's a fair point, but on the other hand, I've personally found that having too many blocks can become overwhelming: each time you need to add something, you still need to go through all of them and understand which ones to use.
I think it makes sense to just add the code to the same blocks here; the change is small and fits in. But this is really a matter of preference, not right or wrong. Maybe we'll know better in the future, after building more pipelines :)

Collaborator Author:

Actually, I changed my mind - I agree it's better to separate them out. Otherwise negative_prompt_embeds will show up as an optional argument in the auto docstring for both Klein and Dev, which is confusing.
Note that in Qwen (https://github.com/huggingface/diffusers/blob/main/src/diffusers/modular_pipelines/qwenimage/inputs.py#L232), I'm experimenting with more composable blocks for situations like this that you can just reuse. But it also makes the blocks more complex, and I'm not sure if I'm over-engineering. So let's keep them simple here and see how it goes.
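
For illustration, a minimal sketch of what the split could look like on the inputs side; the class names and the import path are assumptions, and only the InputParam fields are taken from the diff snippets above.

# hypothetical sketch of separate input definitions for Dev vs. Klein;
# class names are made up, and the InputParam import path is an assumption
from typing import List
from diffusers.modular_pipelines import InputParam


class Flux2DevDenoiseInputs:
    def inputs(self) -> List[InputParam]:
        # Flux.2-dev does not use negative_prompt_embeds
        return [
            InputParam(name="prompt_embeds", required=True),
        ]


class Flux2KleinDenoiseInputs:
    def inputs(self) -> List[InputParam]:
        # Flux.2-klein (base) does use negative_prompt_embeds
        return [
            InputParam(name="prompt_embeds", required=True),
            InputParam(name="negative_prompt_embeds", required=False),
        ]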

Member:

Thank you!

Comment on lines +71 to +75
def get_default_blocks_name(self, config_dict: Optional[Dict[str, Any]]) -> Optional[str]:
    if config_dict is not None and "is_distilled" in config_dict and config_dict["is_distilled"]:
        return "Flux2KleinAutoBlocks"
    else:
        return "Flux2KleinBaseAutoBlocks"
Member:

Maybe naming them as Flux2KleinDistilledBlocks and Flux2KleinBaseAutoBlocks is slightly better?

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>
block_state.timesteps = timesteps
block_state.num_inference_steps = num_inference_steps

batch_size = block_state.batch_size * block_state.num_images_per_prompt
Collaborator Author:

separated this out into a prepare_guidance block



class Flux2DecodeStep(ModularPipelineBlocks):
class Flux2UnpackLatentsStep(ModularPipelineBlocks):
Collaborator Author:

move the "unpacking latent' out of decode step -> this is just to make it easier for decode step work in standalone manner, i.e. user only need to pass latent (not latent_id)



AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", Flux2TextEncoderStep()),
        ("text_input", Flux2TextInputStep()),
Collaborator Author:

rearranged a bit so this works with mellon
