You can find all the original Wan2.1 checkpoints under the [Wan-AI](https://huggingface.co/Wan-AI) organization.
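If it helps to browse those checkpoints programmatically, here is a small sketch using `huggingface_hub` (the `search` filter is only an assumed naming convention, not an official list):

```py
# pip install huggingface_hub
from huggingface_hub import list_models

# list the Wan2.1 repositories published by the Wan-AI organization
for model in list_models(author="Wan-AI", search="Wan2.1"):
    print(model.id)
```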
> [!TIP]
> Click on the Wan2.1 models in the right sidebar for more examples of video generation.

The example below demonstrates how to generate a video from text, optimized for memory or inference speed.

The Wan2.1 text-to-video model below requires ~13GB of VRAM.

```py
# pip install ftfy
import torch
import numpy as np
from diffusers import AutoModel, WanPipeline
from diffusers.hooks.group_offloading import apply_group_offloading
from diffusers.utils import export_to_video, load_image
from transformers import UMT5EncoderModel

text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)

# group-offloading
onload_device = torch.device("cuda")
offload_device = torch.device("cpu")

# offload text encoder blocks and transformer leaves to the CPU while they are idle
# (typical settings; tune num_blocks_per_group and use_stream for your hardware)
apply_group_offloading(
    text_encoder,
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="block_level",
    num_blocks_per_group=4
)
transformer.enable_group_offload(
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="leaf_level",
    use_stream=True
)

pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
    vae=vae,
    transformer=transformer,
    text_encoder=text_encoder,
    torch_dtype=torch.bfloat16
)
```

Compilation is slow the first time, but subsequent calls to the pipeline are faster.

```py
# pip install ftfy
import torch
import numpy as np
from diffusers import AutoModel, WanPipeline
from diffusers.hooks.group_offloading import apply_group_offloading
from diffusers.utils import export_to_video, load_image
from transformers import UMT5EncoderModel

text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)

pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
    vae=vae,
    transformer=transformer,
    text_encoder=text_encoder,
    torch_dtype=torch.bfloat16
)
```
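The speed-oriented path relies on compiling the transformer. A minimal sketch, assuming the `pipeline` created above (the `mode` and `fullgraph` settings are illustrative choices, not required values):

```py
# compile the transformer once; subsequent pipeline calls reuse the compiled kernels
pipeline.transformer = torch.compile(
    pipeline.transformer, mode="max-autotune", fullgraph=True
)
```

Changing input shapes (for example `height`, `width`, or `num_frames`) can trigger recompilation, so keep them fixed across calls to benefit from the cached graph. The next example switches to the smaller 1.3B checkpoint with the UniPC scheduler.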
```py
# pip install ftfy
import torch
from diffusers import AutoModel, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video

vae = AutoModel.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
)
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", vae=vae, torch_dtype=torch.bfloat16
)
pipeline.scheduler = UniPCMultistepScheduler.from_config(
    pipeline.scheduler.config, flow_shift=5.0
)
pipeline.to("cuda")

# placeholder prompt; replace with a longer, more detailed scene description
prompt = """
A cat and a dog baking a cake together in a kitchen, warm natural light, detailed fur.
"""
195
192
196
193
output = pipeline(
    prompt=prompt,
    num_frames=81,
    guidance_scale=5.0,
).frames[0]
export_to_video(output, "output.mp4", fps=16)
```
The transformer and VAE can also be loaded from individual safetensors files with [`~loaders.FromSingleFileMixin.from_single_file`].

```py
# pip install ftfy
import torch
from diffusers import WanPipeline, AutoModel

vae = AutoModel.from_single_file(
    "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors"
)
transformer = AutoModel.from_single_file(
    "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/diffusion_models/wan2.1_t2v_1.3B_bf16.safetensors",
    torch_dtype=torch.bfloat16
)
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
    vae=vae,
    transformer=transformer,
    torch_dtype=torch.bfloat16
)
```
- Set the [`AutoencoderKLWan`] dtype to `torch.float32` for better decoding quality.
- The number of frames (`num_frames`) should be of the form `4 * k + 1` for an integer `k`; the examples above use `81`.
- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution videos, as in the sketch below.
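A minimal sketch that applies these notes together, assuming the 1.3B text-to-video checkpoint used above (the prompt, resolution, and `flow_shift` value are illustrative):

```py
# pip install ftfy
import torch
from diffusers import AutoModel, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video

# keep the VAE in float32 for better decoding quality
vae = AutoModel.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
)
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", vae=vae, torch_dtype=torch.bfloat16
)

# lower flow_shift suits lower resolutions; raise it for higher resolutions
pipeline.scheduler = UniPCMultistepScheduler.from_config(
    pipeline.scheduler.config, flow_shift=3.0
)
pipeline.to("cuda")

# the frame count must have the form 4 * k + 1
k = 20
num_frames = 4 * k + 1  # 81

output = pipeline(
    prompt="A red fox trotting through fresh snow at dawn",
    height=480,
    width=832,
    num_frames=num_frames,
    guidance_scale=5.0,
).frames[0]
export_to_video(output, "output.mp4", fps=16)
```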
## WanPipeline