
Commit 2e5203b

Hunyuan I2V (#10983)
* update
* update
* update
* add tests
* update
* add model tests
* update docs
* update
* update example
* fix defaults
* update
1 parent d55f411 commit 2e5203b

10 files changed: +1426 −25 lines


docs/source/en/api/pipelines/hunyuan_video.md

Lines changed: 2 additions & 1 deletion
@@ -49,7 +49,8 @@ The following models are available for the image-to-video pipeline:
 
 | Model name | Description |
 |:---|:---|
-| [`https://huggingface.co/Skywork/SkyReels-V1-Hunyuan-I2V`](https://huggingface.co/Skywork/SkyReels-V1-Hunyuan-I2V) | Skywork's custom finetune of HunyuanVideo (de-distilled). Performs best with `97x544x960` resolution. Performs best at `97x544x960` resolution, `guidance_scale=1.0`, `true_cfg_scale=6.0` and a negative prompt. |
+| [`Skywork/SkyReels-V1-Hunyuan-I2V`](https://huggingface.co/Skywork/SkyReels-V1-Hunyuan-I2V) | Skywork's custom finetune of HunyuanVideo (de-distilled). Performs best at `97x544x960` resolution with `guidance_scale=1.0`, `true_cfg_scale=6.0` and a negative prompt. |
+| [`hunyuanvideo-community/HunyuanVideo-I2V`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tencent's official HunyuanVideo I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20). |
 
 ## Quantization
 
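As the new table row notes, the community I2V checkpoint benefits from a higher scheduler `shift`. A minimal sketch of applying that recommendation (model id and shift range come from the table; dtype, device, input image, prompt, and frame count are illustrative assumptions, not part of this diff):

```python
# Sketch: load the I2V checkpoint from the table above, then re-create the
# scheduler with a higher shift per the recommended 7-20 range.
import torch
from diffusers import FlowMatchEulerDiscreteScheduler, HunyuanVideoImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

pipe = HunyuanVideoImageToVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo-I2V", torch_dtype=torch.bfloat16
)
# Override only `shift`; other scheduler settings come from the checkpoint config.
pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipe.scheduler.config, shift=17.0)
pipe.to("cuda")

image = load_image("first_frame.png")  # hypothetical conditioning image
video = pipe(image=image, prompt="A man walks along the beach", num_frames=61).frames[0]
export_to_video(video, "output.mp4", fps=15)
```

Re-creating the scheduler via `from_config` keeps the checkpoint's remaining scheduler settings intact and overrides only `shift`.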
scripts/convert_hunyuan_video_to_diffusers.py

Lines changed: 95 additions & 20 deletions
@@ -3,11 +3,19 @@
 
 import torch
 from accelerate import init_empty_weights
-from transformers import AutoModel, AutoTokenizer, CLIPTextModel, CLIPTokenizer
+from transformers import (
+    AutoModel,
+    AutoTokenizer,
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    LlavaForConditionalGeneration,
+)
 
 from diffusers import (
     AutoencoderKLHunyuanVideo,
     FlowMatchEulerDiscreteScheduler,
+    HunyuanVideoImageToVideoPipeline,
     HunyuanVideoPipeline,
     HunyuanVideoTransformer3DModel,
 )

@@ -134,6 +142,46 @@ def remap_single_transformer_blocks_(key, state_dict):
 VAE_SPECIAL_KEYS_REMAP = {}
 
 
+TRANSFORMER_CONFIGS = {
+    "HYVideo-T/2-cfgdistill": {
+        "in_channels": 16,
+        "out_channels": 16,
+        "num_attention_heads": 24,
+        "attention_head_dim": 128,
+        "num_layers": 20,
+        "num_single_layers": 40,
+        "num_refiner_layers": 2,
+        "mlp_ratio": 4.0,
+        "patch_size": 2,
+        "patch_size_t": 1,
+        "qk_norm": "rms_norm",
+        "guidance_embeds": True,
+        "text_embed_dim": 4096,
+        "pooled_projection_dim": 768,
+        "rope_theta": 256.0,
+        "rope_axes_dim": (16, 56, 56),
+    },
+    "HYVideo-T/2-I2V": {
+        "in_channels": 16 * 2 + 1,
+        "out_channels": 16,
+        "num_attention_heads": 24,
+        "attention_head_dim": 128,
+        "num_layers": 20,
+        "num_single_layers": 40,
+        "num_refiner_layers": 2,
+        "mlp_ratio": 4.0,
+        "patch_size": 2,
+        "patch_size_t": 1,
+        "qk_norm": "rms_norm",
+        "guidance_embeds": False,
+        "text_embed_dim": 4096,
+        "pooled_projection_dim": 768,
+        "rope_theta": 256.0,
+        "rope_axes_dim": (16, 56, 56),
+    },
+}
+
+
 def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
     state_dict[new_key] = state_dict.pop(old_key)
 

@@ -149,11 +197,12 @@ def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
     return state_dict
 
 
-def convert_transformer(ckpt_path: str):
+def convert_transformer(ckpt_path: str, transformer_type: str):
     original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", weights_only=True))
+    config = TRANSFORMER_CONFIGS[transformer_type]
 
     with init_empty_weights():
-        transformer = HunyuanVideoTransformer3DModel()
+        transformer = HunyuanVideoTransformer3DModel(**config)
 
     for key in list(original_state_dict.keys()):
         new_key = key[:]

@@ -205,6 +254,10 @@ def get_args():
     parser.add_argument("--save_pipeline", action="store_true")
     parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
     parser.add_argument("--dtype", default="bf16", help="Torch dtype to save the transformer in.")
+    parser.add_argument(
+        "--transformer_type", type=str, default="HYVideo-T/2-cfgdistill", choices=list(TRANSFORMER_CONFIGS.keys())
+    )
+    parser.add_argument("--flow_shift", type=float, default=7.0)
     return parser.parse_args()
 
 

@@ -228,7 +281,7 @@ def get_args():
         assert args.text_encoder_2_path is not None
 
     if args.transformer_ckpt_path is not None:
-        transformer = convert_transformer(args.transformer_ckpt_path)
+        transformer = convert_transformer(args.transformer_ckpt_path, args.transformer_type)
         transformer = transformer.to(dtype=dtype)
         if not args.save_pipeline:
             transformer.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")

@@ -239,19 +292,41 @@ def get_args():
             vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
 
     if args.save_pipeline:
-        text_encoder = AutoModel.from_pretrained(args.text_encoder_path, torch_dtype=torch.float16)
-        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
-        text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
-        tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
-        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
-
-        pipe = HunyuanVideoPipeline(
-            transformer=transformer,
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            text_encoder_2=text_encoder_2,
-            tokenizer_2=tokenizer_2,
-            scheduler=scheduler,
-        )
-        pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
+        if args.transformer_type == "HYVideo-T/2-cfgdistill":
+            text_encoder = AutoModel.from_pretrained(args.text_encoder_path, torch_dtype=torch.float16)
+            tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
+            text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
+            tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
+            scheduler = FlowMatchEulerDiscreteScheduler(shift=args.flow_shift)
+
+            pipe = HunyuanVideoPipeline(
+                transformer=transformer,
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                scheduler=scheduler,
+            )
+            pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
+        else:
+            text_encoder = LlavaForConditionalGeneration.from_pretrained(
+                args.text_encoder_path, torch_dtype=torch.float16
+            )
+            tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
+            text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
+            tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
+            scheduler = FlowMatchEulerDiscreteScheduler(shift=args.flow_shift)
+            image_processor = CLIPImageProcessor.from_pretrained(args.text_encoder_path)
+
+            pipe = HunyuanVideoImageToVideoPipeline(
+                transformer=transformer,
+                vae=vae,
+                text_encoder=text_encoder,
+                tokenizer=tokenizer,
+                text_encoder_2=text_encoder_2,
+                tokenizer_2=tokenizer_2,
+                scheduler=scheduler,
+                image_processor=image_processor,
+            )
+            pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
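The new `--transformer_type` flag selects one of the two configs above and is threaded through `convert_transformer`, while `--flow_shift` now parameterizes the scheduler in both pipeline branches. A sketch of the selection path (assumes it runs inside the script, where `TRANSFORMER_CONFIGS` is in scope; the checkpoint path is hypothetical). The I2V entry's `in_channels = 16 * 2 + 1` suggests the 16-channel video latents concatenated with 16 image-conditioning latent channels plus a single mask channel:

```python
# Sketch of the config-selection path added above (in-script context assumed:
# TRANSFORMER_CONFIGS is defined in scripts/convert_hunyuan_video_to_diffusers.py).
import torch
from accelerate import init_empty_weights
from diffusers import HunyuanVideoTransformer3DModel

transformer_type = "HYVideo-T/2-I2V"            # default is "HYVideo-T/2-cfgdistill"
config = TRANSFORMER_CONFIGS[transformer_type]  # I2V: guidance_embeds=False, in_channels=33

with init_empty_weights():
    # Empty-weight shell with the right architecture; convert_transformer()
    # then remaps the original state dict into it key by key.
    transformer = HunyuanVideoTransformer3DModel(**config)

original_state_dict = torch.load("ckpts/hunyuan_i2v.pt", map_location="cpu", weights_only=True)
```

A full conversion with pipeline export then combines the flags shown in the hunks above, e.g. `--transformer_ckpt_path`, `--transformer_type "HYVideo-T/2-I2V"`, `--flow_shift`, `--save_pipeline`, and `--output_path`, plus the text-encoder paths the `--save_pipeline` branch asserts on.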

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -313,6 +313,7 @@
         "HunyuanDiTPAGPipeline",
         "HunyuanDiTPipeline",
         "HunyuanSkyreelsImageToVideoPipeline",
+        "HunyuanVideoImageToVideoPipeline",
         "HunyuanVideoPipeline",
         "I2VGenXLPipeline",
         "IFImg2ImgPipeline",

@@ -823,6 +824,7 @@
         HunyuanDiTPAGPipeline,
         HunyuanDiTPipeline,
         HunyuanSkyreelsImageToVideoPipeline,
+        HunyuanVideoImageToVideoPipeline,
         HunyuanVideoPipeline,
         I2VGenXLPipeline,
         IFImg2ImgPipeline,

src/diffusers/models/transformers/transformer_hunyuan_video.py

Lines changed: 10 additions & 2 deletions
@@ -581,7 +581,11 @@ def __init__(
         self.context_embedder = HunyuanVideoTokenRefiner(
             text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
         )
-        self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, pooled_projection_dim)
+
+        if guidance_embeds:
+            self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, pooled_projection_dim)
+        else:
+            self.time_text_embed = CombinedTimestepTextProjEmbeddings(inner_dim, pooled_projection_dim)
 
         # 2. RoPE
         self.rope = HunyuanVideoRotaryPosEmbed(patch_size, patch_size_t, rope_axes_dim, rope_theta)

@@ -708,7 +712,11 @@ def forward(
         image_rotary_emb = self.rope(hidden_states)
 
         # 2. Conditional embeddings
-        temb = self.time_text_embed(timestep, guidance, pooled_projections)
+        if self.config.guidance_embeds:
+            temb = self.time_text_embed(timestep, guidance, pooled_projections)
+        else:
+            temb = self.time_text_embed(timestep, pooled_projections)
+
         hidden_states = self.x_embedder(hidden_states)
         encoder_hidden_states = self.context_embedder(encoder_hidden_states, timestep, encoder_attention_mask)
 
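Since `guidance_embeds=False` swaps in `CombinedTimestepTextProjEmbeddings`, the forward pass calls the embedder without a `guidance` tensor. A toy-sized sketch of that path (all dimensions are deliberately tiny illustrative assumptions, not the real HYVideo-T/2 sizes; `rope_axes_dim` must sum to `attention_head_dim`):

```python
# Toy-config sketch of the guidance_embeds=False path added above.
import torch
from diffusers import HunyuanVideoTransformer3DModel

model = HunyuanVideoTransformer3DModel(
    in_channels=4,
    out_channels=4,
    num_attention_heads=2,
    attention_head_dim=8,
    num_layers=1,
    num_single_layers=1,
    num_refiner_layers=1,
    patch_size=1,
    patch_size_t=1,
    guidance_embeds=False,  # selects CombinedTimestepTextProjEmbeddings
    text_embed_dim=16,
    pooled_projection_dim=8,
    rope_axes_dim=(4, 2, 2),  # sums to attention_head_dim
)

batch, frames, height, width, seq = 1, 3, 8, 8, 12
out = model(
    hidden_states=torch.randn(batch, 4, frames, height, width),
    timestep=torch.tensor([500]),
    encoder_hidden_states=torch.randn(batch, seq, 16),
    encoder_attention_mask=torch.ones(batch, seq, dtype=torch.bool),
    pooled_projections=torch.randn(batch, 8),
    # guidance is omitted: the config.guidance_embeds branch skips it
)
```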
src/diffusers/pipelines/__init__.py

Lines changed: 10 additions & 2 deletions
@@ -222,7 +222,11 @@
             "EasyAnimateControlPipeline",
         ]
     _import_structure["hunyuandit"] = ["HunyuanDiTPipeline"]
-    _import_structure["hunyuan_video"] = ["HunyuanVideoPipeline", "HunyuanSkyreelsImageToVideoPipeline"]
+    _import_structure["hunyuan_video"] = [
+        "HunyuanVideoPipeline",
+        "HunyuanSkyreelsImageToVideoPipeline",
+        "HunyuanVideoImageToVideoPipeline",
+    ]
     _import_structure["kandinsky"] = [
         "KandinskyCombinedPipeline",
         "KandinskyImg2ImgCombinedPipeline",

@@ -570,7 +574,11 @@
             FluxPriorReduxPipeline,
             ReduxImageEncoder,
         )
-        from .hunyuan_video import HunyuanSkyreelsImageToVideoPipeline, HunyuanVideoPipeline
+        from .hunyuan_video import (
+            HunyuanSkyreelsImageToVideoPipeline,
+            HunyuanVideoImageToVideoPipeline,
+            HunyuanVideoPipeline,
+        )
         from .hunyuandit import HunyuanDiTPipeline
         from .i2vgen_xl import I2VGenXLPipeline
         from .kandinsky import (

src/diffusers/pipelines/hunyuan_video/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@
 else:
     _import_structure["pipeline_hunyuan_skyreels_image2video"] = ["HunyuanSkyreelsImageToVideoPipeline"]
     _import_structure["pipeline_hunyuan_video"] = ["HunyuanVideoPipeline"]
+    _import_structure["pipeline_hunyuan_video_image2video"] = ["HunyuanVideoImageToVideoPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:

@@ -35,6 +36,7 @@
     else:
         from .pipeline_hunyuan_skyreels_image2video import HunyuanSkyreelsImageToVideoPipeline
         from .pipeline_hunyuan_video import HunyuanVideoPipeline
+        from .pipeline_hunyuan_video_image2video import HunyuanVideoImageToVideoPipeline
 
 else:
     import sys
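With the three `__init__` registrations above in place, the new class resolves through diffusers' lazy-import machinery from the package root; a quick sanity check (expected output inferred from the module registration above):

```python
# Sanity check: the lazy-import tables above expose the new pipeline at the
# package root; the class itself lives in the new pipeline module.
from diffusers import HunyuanVideoImageToVideoPipeline

print(HunyuanVideoImageToVideoPipeline.__module__)
# expected: diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video_image2video
```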
