Commit 2a4090b

feedback

1 parent a7acbb4 commit 2a4090b

5 files changed, +121 -121 lines changed

docs/source/en/api/pipelines/cogvideox.md

Lines changed: 22 additions & 23 deletions
@@ -39,34 +39,33 @@ The quantized CogVideoX 5B model below requires ~16GB of VRAM.

 ```py
 import torch
-from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
+from diffusers import CogVideoXPipeline, AutoModel, TorchAoConfig
 from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video

 # quantize weights to int8 with torchao
 quantization_config = TorchAoConfig("int8wo")
-transformer = CogVideoXTransformer3DModel.from_pretrained(
+transformer = AutoModel.from_pretrained(
     "THUDM/CogVideoX-5b",
     subfolder="transformer",
     quantization_config=quantization_config,
     torch_dtype=torch.bfloat16,
 )

 # fp8 layerwise weight-casting
-transformer = CogVideoXTransformer3DModel.from_pretrained(
-  "THUDM/CogVideoX-5b",
-  subfolder="transformer",
-  torch_dtype=torch.bfloat16
+transformer = AutoModel.from_pretrained(
+    "THUDM/CogVideoX-5b",
+    subfolder="transformer",
+    torch_dtype=torch.bfloat16
 )
 transformer.enable_layerwise_casting(
-  storage_dtype=torch.float8_e4m3fn,
-  compute_dtype=torch.bfloat16
+    storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16
 )

 pipeline = CogVideoXPipeline.from_pretrained(
-  "THUDM/CogVideoX-5b",
-  transformer=transformer,
-  torch_dtype=torch.bfloat16
+    "THUDM/CogVideoX-5b",
+    transformer=transformer,
+    torch_dtype=torch.bfloat16
 )
 pipeline.to("cuda")

@@ -81,9 +80,9 @@ with the toy ship's journey symbolizing endless adventures in a whimsical, indoo
 """

 video = pipeline(
-  prompt=prompt,
-  guidance_scale=6,
-  num_inference_steps=50
+    prompt=prompt,
+    guidance_scale=6,
+    num_inference_steps=50
 ).frames[0]
 export_to_video(video, "output.mp4", fps=8)
 ```
@@ -95,12 +94,12 @@ Compilation is slow the first time but subsequent calls to the pipeline are fast

 ```py
 import torch
-from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
+from diffusers import CogVideoXPipeline
 from diffusers.utils import export_to_video

 pipeline = CogVideoXPipeline.from_pretrained(
-  "THUDM/CogVideoX-2b",
-  torch_dtype=torch.float16
+    "THUDM/CogVideoX-2b",
+    torch_dtype=torch.float16
 ).to("cuda")

 # torch.compile
@@ -117,9 +116,9 @@ with the toy ship's journey symbolizing endless adventures in a whimsical, indoo
 """

 video = pipeline(
-  prompt=prompt,
-  guidance_scale=6,
-  num_inference_steps=50
+    prompt=prompt,
+    guidance_scale=6,
+    num_inference_steps=50
 ).frames[0]
 export_to_video(video, "output.mp4", fps=8)
 ```
@@ -133,13 +132,13 @@ export_to_video(video, "output.mp4", fps=8)

 ```py
 import torch
-from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
+from diffusers import CogVideoXPipeline
 from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video

 pipeline = CogVideoXPipeline.from_pretrained(
-  "THUDM/CogVideoX-5b",
-  torch_dtype=torch.bfloat16
+    "THUDM/CogVideoX-5b",
+    torch_dtype=torch.bfloat16
 )
 pipeline.to("cuda")

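The hunks above swap the model-specific `CogVideoXTransformer3DModel` loader for `AutoModel`. As a minimal sketch (assuming a diffusers release that exposes `AutoModel`; the type check is only illustrative), the resolved class is unchanged because `AutoModel` dispatches on the config stored in the checkpoint's `transformer` subfolder:

```py
import torch
from diffusers import AutoModel

# AutoModel reads the config in the "transformer" subfolder and instantiates the
# class named there, so the returned module matches the old class-specific call.
transformer = AutoModel.from_pretrained(
    "THUDM/CogVideoX-5b",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)
print(type(transformer).__name__)  # expected: CogVideoXTransformer3DModel
```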
docs/source/en/api/pipelines/hunyuan_video.md

Lines changed: 9 additions & 7 deletions
@@ -25,6 +25,8 @@
 You can find all the original HunyuanVideo checkpoints under the [Tencent](https://huggingface.co/tencent) organization.

 > [!TIP]
+> Click on the HunyuanVideo models in the right sidebar for more examples of video generation tasks.
+>
 > The examples below use a checkpoint from [hunyuanvideo-community](https://huggingface.co/hunyuanvideo-community) because the weights are stored in a layout compatible with Diffusers.

 The example below demonstrates how to generate a video optimized for memory or inference speed.
@@ -38,12 +40,12 @@ The quantized HunyuanVideo model below requires ~14GB of VRAM.

 ```py
 import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
+from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, AutoModel, HunyuanVideoPipeline
 from diffusers.utils import export_to_video

 # quantize weights to int4 with bitsandbytes
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
-transformer = HunyuanVideoTransformer3DModel.from_pretrained(
+transformer = AutoModel.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
     subfolder="transformer",
     quantization_config=quant_config,
@@ -72,12 +74,12 @@ Compilation is slow the first time but subsequent calls to the pipeline are fast

 ```py
 import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
+from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, AutoModel, HunyuanVideoPipeline
 from diffusers.utils import export_to_video

 # quantize weights to int4 with bitsandbytes
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
-transformer = HunyuanVideoTransformer3DModel.from_pretrained(
+transformer = AutoModel.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
     subfolder="transformer",
     quantization_config=quant_config,
@@ -114,12 +116,12 @@ export_to_video(video, "output.mp4", fps=15)

 ```py
 import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
+from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, AutoModel, HunyuanVideoPipeline
 from diffusers.utils import export_to_video

 # quantize weights to int4 with bitsandbytes
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
-transformer = HunyuanVideoTransformer3DModel.from_pretrained(
+transformer = AutoModel.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
     subfolder="transformer",
     quantization_config=quant_config,
@@ -157,7 +159,7 @@ export_to_video(video, "output.mp4", fps=15)
 | vae dtype | `torch.float16` |
 | `num_frames (k)` | 4 * `k` + 1 |

-- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos, and try higher `shift` values (`7.0` to `12.0`) for higher resolution images.
+- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution images.

 ## HunyuanVideoPipeline

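The HunyuanVideo hunks show the quantized transformer load, but the surrounding pipeline assembly falls outside the diff context. Below is a minimal sketch of how such a transformer is typically passed to the pipeline; the dtypes, offloading call, prompt, and frame count are illustrative assumptions, not part of the commit:

```py
import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, AutoModel, HunyuanVideoPipeline
from diffusers.utils import export_to_video

# quantize the transformer weights to int4 with bitsandbytes
quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
transformer = AutoModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)

# pass the quantized transformer to the pipeline; the other components load normally
pipeline = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    transformer=transformer,
    torch_dtype=torch.float16,
)
pipeline.enable_model_cpu_offload()  # optional, trades speed for lower VRAM use

video = pipeline(prompt="A cat walks on the grass, realistic style.", num_frames=61).frames[0]
export_to_video(video, "output.mp4", fps=15)
```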
docs/source/en/api/pipelines/ltx_video.md

Lines changed: 14 additions & 15 deletions
@@ -38,19 +38,18 @@ The LTX-Video model below requires ~10GB of VRAM.

 ```py
 import torch
-from diffusers import LTXPipeline, LTXVideoTransformer3DModel
+from diffusers import LTXPipeline, AutoModel
 from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video

 # fp8 layerwise weight-casting
-transformer = LTXVideoTransformer3DModel.from_pretrained(
-  "Lightricks/LTX-Video",
-  subfolder="transformer",
-  torch_dtype=torch.bfloat16
+transformer = AutoModel.from_pretrained(
+    "Lightricks/LTX-Video",
+    subfolder="transformer",
+    torch_dtype=torch.bfloat16
 )
 transformer.enable_layerwise_casting(
-  storage_dtype=torch.float8_e4m3fn,
-  compute_dtype=torch.bfloat16
+    storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16
 )

 pipeline = LTXPipeline.from_pretrained("Lightricks/LTX-Video", transformer=transformer, torch_dtype=torch.bfloat16)
@@ -159,17 +158,17 @@ export_to_video(video, "output.mp4", fps=24)
 ```py
 import torch
 from diffusers.utils import export_to_video
-from diffusers import LTXPipeline, LTXVideoTransformer3DModel, GGUFQuantizationConfig
+from diffusers import LTXPipeline, AutoModel, GGUFQuantizationConfig

-transformer = LTXVideoTransformer3DModel.from_single_file(
-  "https://huggingface.co/city96/LTX-Video-gguf/blob/main/ltx-video-2b-v0.9-Q3_K_S.gguf",
-  quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
-  torch_dtype=torch.bfloat16
+transformer = AutoModel.from_single_file(
+    "https://huggingface.co/city96/LTX-Video-gguf/blob/main/ltx-video-2b-v0.9-Q3_K_S.gguf",
+    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
+    torch_dtype=torch.bfloat16
 )
 pipeline = LTXPipeline.from_pretrained(
-  "Lightricks/LTX-Video",
-  transformer=transformer,
-  torch_dtype=torch.bfloat16
+    "Lightricks/LTX-Video",
+    transformer=transformer,
+    torch_dtype=torch.bfloat16
 )
 ```

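The GGUF example above loads the transformer, but the generation step falls outside the hunk. A sketch of the end-to-end usage, assuming the `gguf` package is installed; the prompt, frame count, and step count are placeholders rather than values from the commit:

```py
import torch
from diffusers import LTXPipeline, AutoModel, GGUFQuantizationConfig
from diffusers.utils import export_to_video

# load the GGUF-quantized transformer (requires `pip install gguf`)
transformer = AutoModel.from_single_file(
    "https://huggingface.co/city96/LTX-Video-gguf/blob/main/ltx-video-2b-v0.9-Q3_K_S.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
pipeline = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
).to("cuda")

video = pipeline(
    prompt="A woman walks along a beach at sunset",  # placeholder prompt
    num_frames=161,
    num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=24)
```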
docs/source/en/api/pipelines/wan.md

Lines changed: 30 additions & 34 deletions
@@ -25,7 +25,7 @@
 You can find all the original Wan2.1 checkpoints under the [Wan-AI](https://huggingface.co/Wan-AI) organization.

 > [!TIP]
-> Click on the Wan2.1 models in the right sidebar for more examples of other video generation tasks.
+> Click on the Wan2.1 models in the right sidebar for more examples of video generation.

 The example below demonstrates how to generate a video from text optimized for memory or inference speed.

@@ -38,17 +38,16 @@ The Wan2.1 text-to-video model below requires ~13GB of VRAM.

 ```py
 # pip install ftfy
-
 import torch
 import numpy as np
-from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanPipeline
+from diffusers import AutoModel, WanPipeline
 from diffusers.hooks.group_offloading import apply_group_offloading
 from diffusers.utils import export_to_video, load_image
-from transformers import UMT5EncoderModel, CLIPVisionModel
+from transformers import UMT5EncoderModel

 text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
-vae = AutoencoderKLWan.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
-transformer = WanTransformer3DModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
+transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)

 # group-offloading
 onload_device = torch.device("cuda")
@@ -67,7 +66,7 @@ transformer.enable_group_offload(
 )

 pipeline = WanPipeline.from_pretrained(
-    model_id,
+    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
     vae=vae,
     transformer=transformer,
     text_encoder=text_encoder,
@@ -104,20 +103,19 @@ Compilation is slow the first time but subsequent calls to the pipeline are fast

 ```py
 # pip install ftfy
-
 import torch
 import numpy as np
-from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanPipeline
+from diffusers import AutoModel, WanPipeline
 from diffusers.hooks.group_offloading import apply_group_offloading
 from diffusers.utils import export_to_video, load_image
-from transformers import UMT5EncoderModel, CLIPVisionModel
+from transformers import UMT5EncoderModel

 text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
-vae = AutoencoderKLWan.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
-transformer = WanTransformer3DModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
+transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)

 pipeline = WanPipeline.from_pretrained(
-    model_id,
+    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
     vae=vae,
     transformer=transformer,
     text_encoder=text_encoder,
@@ -162,20 +160,19 @@ export_to_video(output, "output.mp4", fps=16)

 ```py
 # pip install ftfy
-
 import torch
-from diffusers import WanPipeline
+from diffusers import AutoModel, WanPipeline
 from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
 from diffusers.utils import export_to_video

-vae = AutoencoderKLWan.from_pretrained(
-  "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
+vae = AutoModel.from_pretrained(
+    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
 )
 pipeline = WanPipeline.from_pretrained(
-  "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", vae=vae, torch_dtype=torch.bfloat16
+    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", vae=vae, torch_dtype=torch.bfloat16
 )
 pipeline.scheduler = UniPCMultistepScheduler.from_config(
-  pipeline.scheduler.config, flow_shift=5.0
+    pipeline.scheduler.config, flow_shift=5.0
 )
 pipeline.to("cuda")

@@ -194,9 +191,9 @@ export_to_video(output, "output.mp4", fps=16)
 """

 output = pipeline(
-  prompt=prompt,
-  num_frames=81,
-  guidance_scale=5.0,
+    prompt=prompt,
+    num_frames=81,
+    guidance_scale=5.0,
 ).frames[0]
 export_to_video(output, "output.mp4", fps=16)
 ```
@@ -205,30 +202,29 @@ export_to_video(output, "output.mp4", fps=16)

 ```py
 # pip install ftfy
-
 import torch
-from diffusers import WanPipeline, WanTransformer3DModel, AutoencoderKLWan
+from diffusers import WanPipeline, AutoModel

-vae = AutoencoderKLWan.from_single_file(
-  "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors"
+vae = AutoModel.from_single_file(
+    "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors"
 )
-transformer = WanTransformer3DModel.from_single_file(
-  "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/diffusion_models/wan2.1_t2v_1.3B_bf16.safetensors",
-  torch_dtype=torch.bfloat16
+transformer = AutoModel.from_single_file(
+    "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/diffusion_models/wan2.1_t2v_1.3B_bf16.safetensors",
+    torch_dtype=torch.bfloat16
 )
 pipeline = WanPipeline.from_pretrained(
-  "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
-  vae=vae,
-  transformer=transformer,
-  torch_dtype=torch.bfloat16
+    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+    vae=vae,
+    transformer=transformer,
+    torch_dtype=torch.bfloat16
 )
 ```

 - Set the [`AutoencoderKLWan`] dtype to `torch.float32` for better decoding quality.

 - The number of frames per second (fps) or `k` should be calculated by `4 * k + 1`.

-- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos, and try higher `shift` values (`7.0` to `12.0`) for higher resolution images.
+- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution images.

 ## WanPipeline

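The `shift` bullet at the end of the Wan file corresponds to the `flow_shift` argument shown earlier in the diff. A minimal sketch of applying it, assuming the 1.3B text-to-video checkpoint; the resolution pairings in the comments are illustrative, not from the commit:

```py
import torch
from diffusers import AutoModel, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

vae = AutoModel.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
)
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", vae=vae, torch_dtype=torch.bfloat16
)

# lower resolutions tend to work better with a lower shift (2.0-5.0),
# higher resolutions with a higher shift (7.0-12.0)
flow_shift = 3.0  # e.g. a lower-resolution run; try ~7.0 for higher resolutions
pipeline.scheduler = UniPCMultistepScheduler.from_config(
    pipeline.scheduler.config, flow_shift=flow_shift
)
pipeline.to("cuda")
```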