From eda4a31adf22e8dea301b17d3794a93123f9bd06 Mon Sep 17 00:00:00 2001
From: Jinghan Li
Date: Fri, 5 Dec 2025 06:58:06 +0000
Subject: [PATCH] Introduce AutoPipelineForText2Video (simple)

---
 auto_pipeline_test.py                    | 27 +++++++++++++++
 src/diffusers/__init__.py                |  1 +
 src/diffusers/pipelines/__init__.py      |  1 +
 src/diffusers/pipelines/auto_pipeline.py | 42 +++++++++++++++++++++++-
 4 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 auto_pipeline_test.py

diff --git a/auto_pipeline_test.py b/auto_pipeline_test.py
new file mode 100644
index 000000000000..50604d19abb6
--- /dev/null
+++ b/auto_pipeline_test.py
@@ -0,0 +1,27 @@
+import torch
+from diffusers import AutoPipelineForText2Video
+from diffusers.utils import export_to_video
+
+wan_list = [
+    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+    "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers",
+    "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
+    "Wan-AI/Wan2.1-VACE-1.3B-diffusers",
+    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+]
+
+pipe = AutoPipelineForText2Video.from_pretrained(
+    wan_list[3],
+    # torch_dtype=torch.float16,
+)
+
+print(pipe.text_encoder.__class__.__name__)
+
+# img = torch.randn(1, 3, 10, 512, 512)  # batch 1, 3 channels, 10 frames, 512x512
+# latent = pipe.vae.encode(img).latent_dist.mode()  # encoder output
+# print("Latent shape:", latent.shape)
+
+# # Latent shape: torch.Size([1, 16, 3, 64, 64])
+
+# recon = pipe.vae.decode(latent).sample
+# print("Reconstructed image shape:", recon.shape)
\ No newline at end of file
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 6df4ad489415..706792507f9a 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -305,6 +305,7 @@
         "AutoPipelineForImage2Image",
         "AutoPipelineForInpainting",
         "AutoPipelineForText2Image",
+        "AutoPipelineForText2Video",
         "ConsistencyModelPipeline",
         "DanceDiffusionPipeline",
         "DDIMPipeline",
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 388551f812f8..d854b8457fa5 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -46,6 +46,7 @@
         "AutoPipelineForImage2Image",
         "AutoPipelineForInpainting",
         "AutoPipelineForText2Image",
+        "AutoPipelineForText2Video",
     ]
     _import_structure["consistency_models"] = ["ConsistencyModelPipeline"]
     _import_structure["dance_diffusion"] = ["DanceDiffusionPipeline"]
diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py
index db0268a2a73d..28dd5b5be135 100644
--- a/src/diffusers/pipelines/auto_pipeline.py
+++ b/src/diffusers/pipelines/auto_pipeline.py
@@ -117,7 +117,7 @@
     StableDiffusionXLInpaintPipeline,
     StableDiffusionXLPipeline,
 )
-from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
+from .wan import WanAnimatePipeline, WanImageToVideoPipeline, WanPipeline, WanVACEPipeline, WanVideoToVideoPipeline
 from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline
 from .z_image import ZImageImg2ImgPipeline, ZImagePipeline
 
@@ -221,6 +221,10 @@
 AUTO_TEXT2VIDEO_PIPELINES_MAPPING = OrderedDict(
     [
         ("wan", WanPipeline),
+        ("wan-animate", WanAnimatePipeline),
+        ("wan-image-to-video", WanImageToVideoPipeline),
+        ("wan-vace", WanVACEPipeline),
+        ("wan-video-to-video", WanVideoToVideoPipeline),
     ]
 )
 
@@ -1206,3 +1210,39 @@ def from_pipe(cls, pipeline, **kwargs):
         model.register_to_config(**unused_original_config)
 
         return model
+
+
+class AutoPipelineForText2Video(ConfigMixin):
+    config_name = "model_index.json"
+
+    def __init__(self, *args, **kwargs):
+        raise EnvironmentError(
f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_pipe(pipeline)` methods." + ) + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + + load_config_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "proxies": proxies, + "token": token, + "local_files_only": local_files_only, + "revision": revision, + } + + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + orig_class_name = config["_class_name"] + text_to_video_cls = _get_task_class(AUTO_TEXT2VIDEO_PIPELINES_MAPPING, orig_class_name) + kwargs = {**load_config_kwargs, **kwargs} + return text_to_video_cls.from_pretrained(pretrained_model_or_path, **kwargs)