Commit ed7d298

pipeline-level quant config
1 parent 75e6a50 commit ed7d298

File tree

4 files changed: +63 -59 lines changed
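Every file below makes the same change: instead of quantizing a component with a model-level config and passing the already-quantized module into the pipeline, the examples now hand a single `PipelineQuantizationConfig` to `from_pretrained` and let the pipeline quantize the named components while loading. Below is a minimal, self-contained sketch of the new pattern, using the HunyuanVideo values that appear in the hunks in this commit; the generation call at the end is illustrative and not part of the diff:

```py
import torch
from diffusers import HunyuanVideoPipeline
from diffusers.quantizers import PipelineQuantizationConfig
from diffusers.utils import export_to_video

# one pipeline-level config names the backend, its kwargs, and which components to quantize
pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    },
    components_to_quantize=["transformer"],
)

# the pipeline applies the config while loading, so no separate AutoModel.from_pretrained call is needed
pipeline = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
)
pipeline.enable_model_cpu_offload()

# illustrative generation call (prompt and frame count are assumptions, not taken from the diff)
video = pipeline(prompt="A cat walks on the grass", num_frames=61).frames[0]
export_to_video(video, "output.mp4", fps=15)
```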

docs/source/en/api/pipelines/cogvideox.md

Lines changed: 7 additions & 7 deletions
@@ -39,17 +39,16 @@ The quantized CogVideoX 5B model below requires ~16GB of VRAM.
 
 ```py
 import torch
-from diffusers import CogVideoXPipeline, AutoModel, TorchAoConfig
+from diffusers import CogVideoXPipeline, AutoModel
+from diffusers.quantizers import PipelineQuantizationConfig
 from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video
 
 # quantize weights to int8 with torchao
-quantization_config = TorchAoConfig("int8wo")
-transformer = AutoModel.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
+pipeline_quant_config = PipelineQuantizationConfig(
+    quant_backend="torchao",
+    quant_kwargs={"quant_type": "int8wo"},
+    components_to_quantize=["transformer"]
 )
 
 # fp8 layerwise weight-casting
@@ -65,6 +64,7 @@ transformer.enable_layerwise_casting(
 pipeline = CogVideoXPipeline.from_pretrained(
     "THUDM/CogVideoX-5b",
     transformer=transformer,
+    quantization_config=pipeline_quant_config,
     torch_dtype=torch.bfloat16
 )
 pipeline.to("cuda")
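A quick way to check the effect of the torchao config above, and the ~16GB VRAM figure quoted in the hunk header, is to inspect the loaded transformer's memory footprint and the peak allocation after one generation. This is an illustrative sanity check, not part of the documented example; it assumes `get_memory_footprint` is available on the loaded component and a CUDA device is present:

```py
# rough sanity check on the quantized component (illustrative, not from the doc)
print(f"transformer footprint: {pipeline.transformer.get_memory_footprint() / 1e9:.2f} GB")

# peak VRAM after running the pipeline once, assuming a CUDA device
print(f"peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
```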

docs/source/en/api/pipelines/hunyuan_video.md

Lines changed: 36 additions & 27 deletions
@@ -40,22 +40,25 @@ The quantized HunyuanVideo model below requires ~14GB of VRAM.
 
 ```py
 import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, AutoModel, HunyuanVideoPipeline
+from diffusers import AutoModel, HunyuanVideoPipeline
+from diffusers.quantizers import PipelineQuantizationConfig
 from diffusers.utils import export_to_video
 
 # quantize weights to int4 with bitsandbytes
-quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
-transformer = AutoModel.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16,
+pipeline_quant_config = PipelineQuantizationConfig(
+    quant_backend="bitsandbytes_4bit",
+    quant_kwargs={
+        "load_in_4bit": True,
+        "bnb_4bit_quant_type": "nf4",
+        "bnb_4bit_compute_dtype": torch.bfloat16
+    },
+    components_to_quantize=["transformer"]
 )
 
 pipeline = HunyuanVideoPipeline.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    torch_dtype=torch.float16,
+    quantization_config=pipeline_quant_config,
+    torch_dtype=torch.bfloat16,
 )
 
 # model-offloading and tiling
@@ -74,22 +77,25 @@ Compilation is slow the first time but subsequent calls to the pipeline are fast
 
 ```py
 import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, AutoModel, HunyuanVideoPipeline
+from diffusers import AutoModel, HunyuanVideoPipeline
+from diffusers.quantizers import PipelineQuantizationConfig
 from diffusers.utils import export_to_video
 
 # quantize weights to int4 with bitsandbytes
-quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
-transformer = AutoModel.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16,
+pipeline_quant_config = PipelineQuantizationConfig(
+    quant_backend="bitsandbytes_4bit",
+    quant_kwargs={
+        "load_in_4bit": True,
+        "bnb_4bit_quant_type": "nf4",
+        "bnb_4bit_compute_dtype": torch.bfloat16
+    },
+    components_to_quantize=["transformer"]
 )
 
 pipeline = HunyuanVideoPipeline.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    torch_dtype=torch.float16,
+    quantization_config=pipeline_quant_config,
+    torch_dtype=torch.bfloat16,
 )
 
 # model-offloading and tiling
@@ -116,22 +122,25 @@ export_to_video(video, "output.mp4", fps=15)
 
 ```py
 import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, AutoModel, HunyuanVideoPipeline
+from diffusers import AutoModel, HunyuanVideoPipeline
+from diffusers.quantizers import PipelineQuantizationConfig
 from diffusers.utils import export_to_video
 
 # quantize weights to int4 with bitsandbytes
-quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
-transformer = AutoModel.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16,
+pipeline_quant_config = PipelineQuantizationConfig(
+    quant_backend="bitsandbytes_4bit",
+    quant_kwargs={
+        "load_in_4bit": True,
+        "bnb_4bit_quant_type": "nf4",
+        "bnb_4bit_compute_dtype": torch.bfloat16
+    },
+    components_to_quantize=["transformer"]
 )
 
 pipeline = HunyuanVideoPipeline.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    torch_dtype=torch.float16,
+    quantization_config=pipeline_quant_config,
+    torch_dtype=torch.bfloat16,
 )
 
 # load LoRA weights
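One of the hunks above sits in the section that notes "Compilation is slow the first time but subsequent calls to the pipeline are fast". The compile step itself is outside the hunk, but the usual pattern for compiling the pipeline's transformer looks roughly like the sketch below; the mode and fullgraph flags are assumptions, not taken from this diff:

```py
# illustrative compilation of the loaded transformer; the exact flags used in the doc are not shown here
pipeline.transformer = torch.compile(
    pipeline.transformer, mode="max-autotune", fullgraph=True
)
```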

docs/source/en/api/pipelines/wan.md

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ The Wan2.1 text-to-video model below requires ~13GB of VRAM.
 import torch
 import numpy as np
 from diffusers import AutoModel, WanPipeline
+from diffusers.quantizers import PipelineQuantizationConfig
 from diffusers.hooks.group_offloading import apply_group_offloading
 from diffusers.utils import export_to_video, load_image
 from transformers import UMT5EncoderModel
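This hunk only adds the import; the rest of the wan.md change is not included in this view. Based on the pattern in the other files in this commit, the usage later in that document presumably looks roughly like the sketch below. The backend choice, quantized components, and model id are assumptions for illustration, not taken from the diff:

```py
import torch
from diffusers import WanPipeline
from diffusers.quantizers import PipelineQuantizationConfig

# assumed usage of the newly imported config, mirroring the other files in this commit
pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={"load_in_4bit": True},
    components_to_quantize=["transformer", "text_encoder"],
)

pipeline = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-14B-Diffusers",  # illustrative checkpoint; wan.md may use a different one
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
)
```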

docs/source/en/using-diffusers/text-img2vid.md

Lines changed: 19 additions & 25 deletions
@@ -86,22 +86,25 @@ export_to_video(output, "output.mp4", fps=16)
 
 ```py
 import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, AutoModel, HunyuanVideoPipeline
+from diffusers import AutoModel, HunyuanVideoPipeline
+from diffusers.quantizers import PipelineQuantizationConfig
 from diffusers.utils import export_to_video
 
 # quantize weights to int4 with bitsandbytes
-quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
-transformer = AutoModel.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16,
+pipeline_quant_config = PipelineQuantizationConfig(
+    quant_backend="bitsandbytes_4bit",
+    quant_kwargs={
+        "load_in_4bit": True,
+        "bnb_4bit_quant_type": "nf4",
+        "bnb_4bit_compute_dtype": torch.bfloat16
+    },
+    components_to_quantize=["transformer"]
 )
 
 pipeline = HunyuanVideoPipeline.from_pretrained(
     "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    torch_dtype=torch.float16,
+    quantization_config=pipeline_quant_config,
+    torch_dtype=torch.bfloat16,
 )
 
 # model-offloading and tiling
@@ -360,33 +363,24 @@ The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize
 
 import torch
 from diffusers import WanPipeline
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, AutoModel, WanPipeline
+from diffusers import AutoModel, WanPipeline
+from diffusers.quantizers import PipelineQuantizationConfig
 from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
 from transformers import UMT5EncoderModel
 from diffusers.utils import export_to_video
 
 # quantize transformer and text encoder weights with bitsandbytes
-quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
-transformer = AutoModel.from_pretrained(
-    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
-    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16,
-)
-
-quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True)
-text_encoder = UMT5EncoderModel.from_pretrained(
-    "Wan-AI/Wan2.1-T2V-14B-Diffusers",
-    subfolder="text_encoder",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16,
+pipeline_quant_config = PipelineQuantizationConfig(
+    quant_backend="bitsandbytes_4bit",
+    quant_kwargs={"load_in_4bit": True},
+    components_to_quantize=["transformer", "text_encoder"]
 )
 
 vae = AutoModel.from_pretrained(
     "Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="vae", torch_dtype=torch.float32
 )
 pipeline = WanPipeline.from_pretrained(
-    "Wan-AI/Wan2.1-T2V-14B-Diffusers", transformer=transformer, text_encoder=text_encoder, vae=vae, torch_dtype=torch.bfloat16
+    "Wan-AI/Wan2.1-T2V-14B-Diffusers", vae=vae, quantization_config=pipeline_quant_config, torch_dtype=torch.bfloat16
 )
 pipeline.scheduler = UniPCMultistepScheduler.from_config(
     pipeline.scheduler.config, flow_shift=5.0
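The hunk cuts off inside the scheduler call; the remainder of the Wan example (generation and export) is unchanged by this commit and not shown here. For orientation, the continuation typically looks roughly like the sketch below; the prompt, frame count, and guidance scale are illustrative assumptions:

```py
# illustrative continuation; these lines are outside the hunk shown above
output = pipeline(
    prompt="A cat walks on the grass, realistic",
    num_frames=81,
    guidance_scale=5.0,
).frames[0]
export_to_video(output, "output.mp4", fps=16)
```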
