You can find all the original Wan2.1 checkpoints under the [Wan-AI](https://huggingface.co/Wan-AI) organization.
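If it helps to browse those checkpoints programmatically, here is a small sketch using `huggingface_hub` (the `search` filter is only an assumed naming convention, not an official list):

```py
# pip install huggingface_hub
from huggingface_hub import list_models

# list the Wan2.1 repositories published by the Wan-AI organization
for model in list_models(author="Wan-AI", search="Wan2.1"):
    print(model.id)
```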
> [!TIP]
> Click on the Wan2.1 models in the right sidebar for more examples of video generation.

The example below demonstrates how to generate a video from text, optimized for memory or inference speed.

The Wan2.1 text-to-video model below requires ~13GB of VRAM.

```py
# pip install ftfy
import torch
import numpy as np
from diffusers import AutoModel, WanPipeline
from diffusers.hooks.group_offloading import apply_group_offloading
from diffusers.utils import export_to_video, load_image
from transformers import UMT5EncoderModel

text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)

# group-offloading
onload_device = torch.device("cuda")
offload_device = torch.device("cpu")

# offload text encoder blocks and transformer leaves to the CPU while they are idle
# (typical settings; tune num_blocks_per_group and use_stream for your hardware)
apply_group_offloading(
    text_encoder,
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="block_level",
    num_blocks_per_group=4
)
transformer.enable_group_offload(
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="leaf_level",
    use_stream=True
)

pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
    vae=vae,
    transformer=transformer,
    text_encoder=text_encoder,
    torch_dtype=torch.bfloat16
)
```

Compilation is slow the first time, but subsequent calls to the pipeline are faster.

```py
# pip install ftfy
import torch
import numpy as np
from diffusers import AutoModel, WanPipeline
from diffusers.hooks.group_offloading import apply_group_offloading
from diffusers.utils import export_to_video, load_image
from transformers import UMT5EncoderModel

text_encoder = UMT5EncoderModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="text_encoder", torch_dtype=torch.bfloat16)
vae = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32)
transformer = AutoModel.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)

pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
    vae=vae,
    transformer=transformer,
    text_encoder=text_encoder,
    torch_dtype=torch.bfloat16
)
```
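The speed-oriented path relies on compiling the transformer. A minimal sketch, assuming the `pipeline` created above (the `mode` and `fullgraph` settings are illustrative choices, not required values):

```py
# compile the transformer once; subsequent pipeline calls reuse the compiled kernels
pipeline.transformer = torch.compile(
    pipeline.transformer, mode="max-autotune", fullgraph=True
)
```

Changing input shapes (for example `height`, `width`, or `num_frames`) can trigger recompilation, so keep them fixed across calls to benefit from the cached graph. The next example switches to the smaller 1.3B checkpoint with the UniPC scheduler.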
```py
# pip install ftfy
import torch
from diffusers import AutoModel, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video

vae = AutoModel.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
)
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", vae=vae, torch_dtype=torch.bfloat16
)
pipeline.scheduler = UniPCMultistepScheduler.from_config(
    pipeline.scheduler.config, flow_shift=5.0
)
pipeline.to("cuda")

# placeholder prompt; replace with a longer, more detailed scene description
prompt = """
A cat and a dog baking a cake together in a kitchen, warm natural light, detailed fur.
"""
195
192
196
193
output = pipeline(
    prompt=prompt,
    num_frames=81,
    guidance_scale=5.0,
).frames[0]
export_to_video(output, "output.mp4", fps=16)
```
The transformer and VAE can also be loaded from individual safetensors files with [`~loaders.FromSingleFileMixin.from_single_file`].

```py
# pip install ftfy
import torch
from diffusers import WanPipeline, AutoModel

vae = AutoModel.from_single_file(
    "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors"
)
transformer = AutoModel.from_single_file(
    "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/diffusion_models/wan2.1_t2v_1.3B_bf16.safetensors",
    torch_dtype=torch.bfloat16
)
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
    vae=vae,
    transformer=transformer,
    torch_dtype=torch.bfloat16
)
```
- Set the [`AutoencoderKLWan`] dtype to `torch.float32` for better decoding quality.
- The number of frames (`num_frames`) should be of the form `4 * k + 1` for an integer `k`; the examples above use `81`.
- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution videos, as in the sketch below.
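A minimal sketch that applies these notes together, assuming the 1.3B text-to-video checkpoint used above (the prompt, resolution, and `flow_shift` value are illustrative):

```py
# pip install ftfy
import torch
from diffusers import AutoModel, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video

# keep the VAE in float32 for better decoding quality
vae = AutoModel.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
)
pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", vae=vae, torch_dtype=torch.bfloat16
)

# lower flow_shift suits lower resolutions; raise it for higher resolutions
pipeline.scheduler = UniPCMultistepScheduler.from_config(
    pipeline.scheduler.config, flow_shift=3.0
)
pipeline.to("cuda")

# the frame count must have the form 4 * k + 1
k = 20
num_frames = 4 * k + 1  # 81

output = pipeline(
    prompt="A red fox trotting through fresh snow at dawn",
    height=480,
    width=832,
    num_frames=num_frames,
    guidance_scale=5.0,
).frames[0]
export_to_video(output, "output.mp4", fps=16)
```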
## WanPipeline