From a81d3b8b004520c38161cb8c28e031d21a6f4ec8 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 10 Mar 2025 14:52:53 +0530
Subject: [PATCH 1/3] memory usage tests

---
 tests/quantization/__init__.py             |  0
 tests/quantization/bnb/test_4bit.py        | 46 +++++++++++-----------
 tests/quantization/bnb/test_mixed_int8.py  | 43 ++++++++++----------
 tests/quantization/quanto/__init__.py      |  0
 tests/quantization/quanto/test_quanto.py   | 40 ++++---------------
 tests/quantization/torchao/__init__.py     |  0
 tests/quantization/torchao/test_torchao.py | 33 ++++++----------
 tests/quantization/utils.py                | 39 ++++++++++++++++++
 8 files changed, 102 insertions(+), 99 deletions(-)
 create mode 100644 tests/quantization/__init__.py
 create mode 100644 tests/quantization/quanto/__init__.py
 create mode 100644 tests/quantization/torchao/__init__.py
 create mode 100644 tests/quantization/utils.py

diff --git a/tests/quantization/__init__.py b/tests/quantization/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 6f85e6f38955..809c356c441a 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -54,29 +54,8 @@ def get_some_linear_layer(model):
 
 if is_torch_available():
     import torch
-    import torch.nn as nn
 
-    class LoRALayer(nn.Module):
-        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
-
-        Taken from
-        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_4bit.py#L62C5-L78C77
-        """
-
-        def __init__(self, module: nn.Module, rank: int):
-            super().__init__()
-            self.module = module
-            self.adapter = nn.Sequential(
-                nn.Linear(module.in_features, rank, bias=False),
-                nn.Linear(rank, module.out_features, bias=False),
-            )
-            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
-            nn.init.normal_(self.adapter[0].weight, std=small_std)
-            nn.init.zeros_(self.adapter[1].weight)
-            self.adapter.to(module.weight.device)
-
-        def forward(self, input, *args, **kwargs):
-            return self.module(input, *args, **kwargs) + self.adapter(input)
+    from ..utils import LoRALayer, get_memory_consumption_stat
 
 
 if is_bitsandbytes_available():
@@ -350,6 +329,29 @@ def test_bnb_4bit_errors_loading_incorrect_state_dict(self):
 
         assert key_to_target in str(err_context.exception)
 
+    def test_model_memory_usage(self):
+        # Delete to not let anything interfere.
+        del self.model_4bit, self.model_fp16
+
+        # Re-instantiate.
+        inputs = self.get_dummy_inputs()
+        model_fp16 = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", torch_dtype=torch.float16
+        )
+        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
+        nf4_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+        model_4bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", quantization_config=nf4_config, device_map=torch_device
+        )
+        quantized_model_memory = get_memory_consumption_stat(model_4bit, inputs)
+        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
+        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
+
+
 class BnB4BitTrainingTests(Base4bitTests):
     def setUp(self):
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index 4be420e7dffa..b8b4b1598eb7 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -60,29 +60,8 @@ def get_some_linear_layer(model):
 
 if is_torch_available():
     import torch
-    import torch.nn as nn
 
-    class LoRALayer(nn.Module):
-        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
-
-        Taken from
-        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_8bit.py#L62C5-L78C77
-        """
-
-        def __init__(self, module: nn.Module, rank: int):
-            super().__init__()
-            self.module = module
-            self.adapter = nn.Sequential(
-                nn.Linear(module.in_features, rank, bias=False),
-                nn.Linear(rank, module.out_features, bias=False),
-            )
-            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
-            nn.init.normal_(self.adapter[0].weight, std=small_std)
-            nn.init.zeros_(self.adapter[1].weight)
-            self.adapter.to(module.weight.device)
-
-        def forward(self, input, *args, **kwargs):
-            return self.module(input, *args, **kwargs) + self.adapter(input)
+    from ..utils import LoRALayer, get_memory_consumption_stat
 
 
 if is_bitsandbytes_available():
@@ -248,7 +227,7 @@ def test_llm_skip(self):
         self.assertTrue(linear.weight.dtype == torch.int8)
         self.assertTrue(isinstance(linear, bnb.nn.Linear8bitLt))
 
-        self.assertTrue(isinstance(model_8bit.proj_out, nn.Linear))
+        self.assertTrue(isinstance(model_8bit.proj_out, torch.nn.Linear))
         self.assertTrue(model_8bit.proj_out.weight.dtype != torch.int8)
 
     def test_config_from_pretrained(self):
@@ -308,6 +287,24 @@ def test_device_and_dtype_assignment(self):
         # Check that this does not throw an error
         _ = self.model_fp16.cuda()
 
+    def test_model_memory_usage(self):
+        # Delete to not let anything interfere.
+        del self.model_4bit, self.model_fp16
+
+        # Re-instantiate.
+        inputs = self.get_dummy_inputs()
+        model_fp16 = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", torch_dtype=torch.float16
+        )
+        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
+        config = BitsAndBytesConfig(load_in_8bit=True)
+        model_8bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", quantization_config=config, device_map=torch_device
+        )
+        quantized_model_memory = get_memory_consumption_stat(model_8bit, inputs)
+        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
+        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
+
 
 class Bnb8bitDeviceTests(Base8bitTests):
     def setUp(self) -> None:
diff --git a/tests/quantization/quanto/__init__.py b/tests/quantization/quanto/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py
index 89a56c15ed24..7072ddbcad10 100644
--- a/tests/quantization/quanto/test_quanto.py
+++ b/tests/quantization/quanto/test_quanto.py
@@ -19,29 +19,8 @@ if is_torch_available():
     import torch
-    import torch.nn as nn
-
-    class LoRALayer(nn.Module):
-        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
-
-        Taken from
-        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_4bit.py#L62C5-L78C77
-        """
-
-        def __init__(self, module: nn.Module, rank: int):
-            super().__init__()
-            self.module = module
-            self.adapter = nn.Sequential(
-                nn.Linear(module.in_features, rank, bias=False),
-                nn.Linear(rank, module.out_features, bias=False),
-            )
-            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
-            nn.init.normal_(self.adapter[0].weight, std=small_std)
-            nn.init.zeros_(self.adapter[1].weight)
-            self.adapter.to(module.weight.device)
-
-        def forward(self, input, *args, **kwargs):
-            return self.module(input, *args, **kwargs) + self.adapter(input)
+
+    from ..utils import LoRALayer, get_memory_consumption_stat
 
 
 @nightly
@@ -86,19 +65,14 @@ def test_quanto_layers(self):
 
     def test_quanto_memory_usage(self):
         unquantized_model = self.model_cls.from_pretrained(self.model_id, torch_dtype=self.torch_dtype)
-        unquantized_model_memory = unquantized_model.get_memory_footprint() / 1024**3
-
-        model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs())
         inputs = self.get_dummy_inputs()
+        unquantized_model_memory = get_memory_consumption_stat(unquantized_model, inputs)
 
-        torch.cuda.reset_peak_memory_stats()
-        torch.cuda.empty_cache()
+        quantized_model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs())
+        quantized_model_memory = get_memory_consumption_stat(quantized_model, inputs)
 
-        model.to(torch_device)
-        with torch.no_grad():
-            model(**inputs)
-        max_memory = torch.cuda.max_memory_allocated() / 1024**3
-        assert (1.0 - (max_memory / unquantized_model_memory)) >= self.expected_memory_reduction
+        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
+        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= self.expected_memory_reduction
 
     def test_keep_modules_in_fp32(self):
         r"""
diff --git a/tests/quantization/torchao/__init__.py b/tests/quantization/torchao/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index e14a1cc0369e..72a38ce9e7c3 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -50,27 +50,7 @@
     import torch
     import torch.nn as nn
 
-    class LoRALayer(nn.Module):
-        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
-
-        Taken from
-        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_4bit.py#L62C5-L78C77
-        """
-
-        def __init__(self, module: nn.Module, rank: int):
-            super().__init__()
-            self.module = module
-            self.adapter = nn.Sequential(
-                nn.Linear(module.in_features, rank, bias=False),
-                nn.Linear(rank, module.out_features, bias=False),
-            )
-            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
-            nn.init.normal_(self.adapter[0].weight, std=small_std)
-            nn.init.zeros_(self.adapter[1].weight)
-            self.adapter.to(module.weight.device)
-
-        def forward(self, input, *args, **kwargs):
-            return self.module(input, *args, **kwargs) + self.adapter(input)
+    from ..utils import LoRALayer, get_memory_consumption_stat
 
 
 if is_torchao_available():
@@ -503,6 +483,17 @@ def test_memory_footprint(self):
         # there is additional overhead of scales and zero points
         self.assertTrue(total_bf16 < total_int4wo)
 
+    def test_memory_usage(self):
+        model_id = "hf-internal-testing/tiny-flux-pipe"
+        inputs = self.get_dummy_inputs()
+        transformer_bf16 = self.get_dummy_components(None, model_id=model_id)["transformer"]
+        unquantized_model_memory = get_memory_consumption_stat(transformer_bf16, inputs)
+
+        transformer_int8wo = self.get_dummy_components(TorchAoConfig("int8wo"), model_id=model_id)["transformer"]
+        quantized_model_memory = get_memory_consumption_stat(transformer_int8wo, inputs)
+        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
+        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
+
     def test_wrong_config(self):
         with self.assertRaises(ValueError):
             self.get_dummy_components(TorchAoConfig("int42"))
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
new file mode 100644
index 000000000000..77fbf6463d8d
--- /dev/null
+++ b/tests/quantization/utils.py
@@ -0,0 +1,39 @@
+from diffusers.utils import is_torch_available
+
+
+if is_torch_available():
+    import torch
+    import torch.nn as nn
+
+    class LoRALayer(nn.Module):
+        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
+
+        Taken from
+        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_4bit.py#L62C5-L78C77
+        """
+
+        def __init__(self, module: nn.Module, rank: int):
+            super().__init__()
+            self.module = module
+            self.adapter = nn.Sequential(
+                nn.Linear(module.in_features, rank, bias=False),
+                nn.Linear(rank, module.out_features, bias=False),
+            )
+            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
+            nn.init.normal_(self.adapter[0].weight, std=small_std)
+            nn.init.zeros_(self.adapter[1].weight)
+            self.adapter.to(module.weight.device)
+
+        def forward(self, input, *args, **kwargs):
+            return self.module(input, *args, **kwargs) + self.adapter(input)
+
+
+    @torch.no_grad()
+    @torch.inference_mode()
+    def get_memory_consumption_stat(model, inputs):
+        torch.cuda.reset_peak_memory_stats()
+        torch.cuda.empty_cache()
+
+        model(**inputs)
+        max_memory_mem_allocated = torch.cuda.max_memory_allocated()
+        return max_memory_mem_allocated

From a1b70d1c8b34cc548fca4bc23044ed56a4d52e0b Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 10 Mar 2025 18:00:47 +0530
Subject: [PATCH 2/3] fixes

---
 .../quantizers/bitsandbytes/bnb_quantizer.py |  2 +
 .../quantizers/torchao/torchao_quantizer.py  |  1 +
 tests/quantization/bnb/test_4bit.py          | 57 +++++++++++--------
 tests/quantization/bnb/test_mixed_int8.py    | 48 +++++++++-------
 tests/quantization/quanto/test_quanto.py     | 17 ++++--
 tests/quantization/torchao/test_torchao.py   | 13 +++--
 tests/quantization/utils.py                  |  3 +-
 7 files changed, 84 insertions(+), 57 deletions(-)

diff --git a/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py b/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py
index ada75588a42a..f4aa1504534c 100644
--- a/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py
+++ b/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py
@@ -135,6 +135,7 @@ def create_quantized_param(
         target_device: "torch.device",
         state_dict: Dict[str, Any],
         unexpected_keys: Optional[List[str]] = None,
+        **kwargs,
     ):
         import bitsandbytes as bnb
 
@@ -445,6 +446,7 @@ def create_quantized_param(
         target_device: "torch.device",
         state_dict: Dict[str, Any],
         unexpected_keys: Optional[List[str]] = None,
+        **kwargs,
     ):
         import bitsandbytes as bnb
 
diff --git a/src/diffusers/quantizers/torchao/torchao_quantizer.py b/src/diffusers/quantizers/torchao/torchao_quantizer.py
index e86ce2f64278..03cb29c6f037 100644
--- a/src/diffusers/quantizers/torchao/torchao_quantizer.py
+++ b/src/diffusers/quantizers/torchao/torchao_quantizer.py
@@ -215,6 +215,7 @@ def create_quantized_param(
         target_device: "torch.device",
         state_dict: Dict[str, Any],
         unexpected_keys: List[str],
+        **kwargs,
     ):
         r"""
         Each nn.Linear layer that needs to be quantized is processsed here. First, we set the value the weight tensor,
diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 809c356c441a..97047717cd83 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -75,6 +75,8 @@ class Base4bitTests(unittest.TestCase):
     # This was obtained on audace so the number might slightly change
     expected_rel_difference = 3.69
 
+    expected_memory_saving_ratio = 0.8
+
     prompt = "a beautiful sunset amidst the mountains."
     num_inference_steps = 10
     seed = 0
@@ -119,8 +121,10 @@ def setUp(self):
         )
 
     def tearDown(self):
-        del self.model_fp16
-        del self.model_4bit
+        if hasattr(self, "model_fp16"):
+            del self.model_fp16
+        if hasattr(self, "model_4bit"):
+            del self.model_4bit
 
         gc.collect()
         torch.cuda.empty_cache()
@@ -159,6 +163,32 @@ def test_memory_footprint(self):
         linear = get_some_linear_layer(self.model_4bit)
         self.assertTrue(linear.weight.__class__ == bnb.nn.Params4bit)
 
+    def test_model_memory_usage(self):
+        # Delete to not let anything interfere.
+        del self.model_4bit, self.model_fp16
+
+        # Re-instantiate.
+        inputs = self.get_dummy_inputs()
+        inputs = {
+            k: v.to(device=torch_device, dtype=torch.float16) for k, v in inputs.items() if not isinstance(v, bool)
+        }
+        model_fp16 = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", torch_dtype=torch.float16
+        ).to(torch_device)
+        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
+        del model_fp16
+
+        nf4_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+        model_4bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", quantization_config=nf4_config, torch_dtype=torch.float16
+        )
+        quantized_model_memory = get_memory_consumption_stat(model_4bit, inputs)
+        assert unquantized_model_memory / quantized_model_memory >= self.expected_memory_saving_ratio
+
     def test_original_dtype(self):
         r"""
         A simple test to check if the model succesfully stores the original dtype
@@ -329,29 +359,6 @@ def test_bnb_4bit_errors_loading_incorrect_state_dict(self):
 
         assert key_to_target in str(err_context.exception)
 
-
-    def test_model_memory_usage(self):
-        # Delete to not let anything interfere.
-        del self.model_4bit, self.model_fp16
-
-        # Re-instantiate.
-        inputs = self.get_dummy_inputs()
-        model_fp16 = SD3Transformer2DModel.from_pretrained(
-            self.model_name, subfolder="transformer", torch_dtype=torch.float16
-        )
-        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
-        nf4_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
-        )
-        model_4bit = SD3Transformer2DModel.from_pretrained(
-            self.model_name, subfolder="transformer", quantization_config=nf4_config, device_map=torch_device
-        )
-        quantized_model_memory = get_memory_consumption_stat(model_4bit, inputs)
-        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
-        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
-
-
 class BnB4BitTrainingTests(Base4bitTests):
     def setUp(self):
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index b8b4b1598eb7..4964f8c9af07 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -81,6 +81,8 @@ class Base8bitTests(unittest.TestCase):
     # This was obtained on audace so the number might slightly change
     expected_rel_difference = 1.94
 
+    expected_memory_saving_ratio = 0.7
+
     prompt = "a beautiful sunset amidst the mountains."
     num_inference_steps = 10
     seed = 0
@@ -121,8 +123,10 @@ def setUp(self):
         )
 
     def tearDown(self):
-        del self.model_fp16
-        del self.model_8bit
+        if hasattr(self, "model_fp16"):
+            del self.model_fp16
+        if hasattr(self, "model_8bit"):
+            del self.model_8bit
 
         gc.collect()
         torch.cuda.empty_cache()
@@ -161,6 +165,28 @@ def test_memory_footprint(self):
         linear = get_some_linear_layer(self.model_8bit)
         self.assertTrue(linear.weight.__class__ == bnb.nn.Int8Params)
 
+    def test_model_memory_usage(self):
+        # Delete to not let anything interfere.
+        del self.model_8bit, self.model_fp16
+
+        # Re-instantiate.
+        inputs = self.get_dummy_inputs()
+        inputs = {
+            k: v.to(device=torch_device, dtype=torch.float16) for k, v in inputs.items() if not isinstance(v, bool)
+        }
+        model_fp16 = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", torch_dtype=torch.float16
+        ).to(torch_device)
+        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
+        del model_fp16
+
+        config = BitsAndBytesConfig(load_in_8bit=True)
+        model_8bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", quantization_config=config, torch_dtype=torch.float16
+        )
+        quantized_model_memory = get_memory_consumption_stat(model_8bit, inputs)
+        assert unquantized_model_memory / quantized_model_memory >= self.expected_memory_saving_ratio
+
     def test_original_dtype(self):
         r"""
         A simple test to check if the model succesfully stores the original dtype
@@ -287,24 +313,6 @@ def test_device_and_dtype_assignment(self):
         # Check that this does not throw an error
         _ = self.model_fp16.cuda()
 
-    def test_model_memory_usage(self):
-        # Delete to not let anything interfere.
-        del self.model_4bit, self.model_fp16
-
-        # Re-instantiate.
-        inputs = self.get_dummy_inputs()
-        model_fp16 = SD3Transformer2DModel.from_pretrained(
-            self.model_name, subfolder="transformer", torch_dtype=torch.float16
-        )
-        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
-        config = BitsAndBytesConfig(load_in_8bit=True)
-        model_8bit = SD3Transformer2DModel.from_pretrained(
-            self.model_name, subfolder="transformer", quantization_config=config, device_map=torch_device
-        )
-        quantized_model_memory = get_memory_consumption_stat(model_8bit, inputs)
-        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
-        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
-
 
 class Bnb8bitDeviceTests(Base8bitTests):
     def setUp(self) -> None:
diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py
index 7072ddbcad10..51ca0bfdc0ab 100644
--- a/tests/quantization/quanto/test_quanto.py
+++ b/tests/quantization/quanto/test_quanto.py
@@ -19,7 +19,7 @@ if is_torch_available():
     import torch
-
+
     from ..utils import LoRALayer, get_memory_consumption_stat
 
 
@@ -64,15 +64,20 @@ def test_quanto_layers(self):
                 assert isinstance(module, QLinear)
 
     def test_quanto_memory_usage(self):
-        unquantized_model = self.model_cls.from_pretrained(self.model_id, torch_dtype=self.torch_dtype)
         inputs = self.get_dummy_inputs()
+        inputs = {
+            k: v.to(device=torch_device, dtype=torch.bfloat16) for k, v in inputs.items() if not isinstance(v, bool)
+        }
+
+        unquantized_model = self.model_cls.from_pretrained(self.model_id, torch_dtype=self.torch_dtype)
+        unquantized_model.to(torch_device)
         unquantized_model_memory = get_memory_consumption_stat(unquantized_model, inputs)
 
         quantized_model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs())
+        quantized_model.to(torch_device)
         quantized_model_memory = get_memory_consumption_stat(quantized_model, inputs)
 
-        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
-        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= self.expected_memory_reduction
+        assert unquantized_model_memory / quantized_model_memory >= self.expected_memory_reduction
 
     def test_keep_modules_in_fp32(self):
         r"""
@@ -292,14 +297,14 @@ def test_training(self):
 
 
 class FluxTransformerFloat8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase):
-    expected_memory_reduction = 0.3
+    expected_memory_reduction = 0.6
 
     def get_dummy_init_kwargs(self):
         return {"weights_dtype": "float8"}
 
 
 class FluxTransformerInt8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase):
-    expected_memory_reduction = 0.3
+    expected_memory_reduction = 0.6
     _test_torch_compile = True
 
     def get_dummy_init_kwargs(self):
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 72a38ce9e7c3..0e671307dd18 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -483,16 +483,21 @@ def test_memory_footprint(self):
         # there is additional overhead of scales and zero points
         self.assertTrue(total_bf16 < total_int4wo)
 
-    def test_memory_usage(self):
+    def test_model_memory_usage(self):
         model_id = "hf-internal-testing/tiny-flux-pipe"
-        inputs = self.get_dummy_inputs()
+        expected_memory_saving_ratio = 2.0
+
+        inputs = self.get_dummy_tensor_inputs(device=torch_device)
+
         transformer_bf16 = self.get_dummy_components(None, model_id=model_id)["transformer"]
+        transformer_bf16.to(torch_device)
         unquantized_model_memory = get_memory_consumption_stat(transformer_bf16, inputs)
+        del transformer_bf16
 
         transformer_int8wo = self.get_dummy_components(TorchAoConfig("int8wo"), model_id=model_id)["transformer"]
+        transformer_int8wo.to(torch_device)
         quantized_model_memory = get_memory_consumption_stat(transformer_int8wo, inputs)
-        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
-        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
+        assert unquantized_model_memory / quantized_model_memory >= expected_memory_saving_ratio
 
     def test_wrong_config(self):
         with self.assertRaises(ValueError):
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 77fbf6463d8d..04ebf9e159f4 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -26,8 +26,7 @@ def __init__(self, module: nn.Module, rank: int):
 
         def forward(self, input, *args, **kwargs):
             return self.module(input, *args, **kwargs) + self.adapter(input)
-
-
+
     @torch.no_grad()
     @torch.inference_mode()
     def get_memory_consumption_stat(model, inputs):

From c0aac70ee1411224eafa968c6ee531b0964543c2 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 10 Mar 2025 18:04:39 +0530
Subject: [PATCH 3/3] gguf

---
 src/diffusers/quantizers/gguf/gguf_quantizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py
index 0c760e277ce4..6da69c7bd60c 100644
--- a/src/diffusers/quantizers/gguf/gguf_quantizer.py
+++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py
@@ -108,6 +108,7 @@ def create_quantized_param(
         target_device: "torch.device",
         state_dict: Optional[Dict[str, Any]] = None,
         unexpected_keys: Optional[List[str]] = None,
+        **kwargs,
     ):
         module, tensor_name = get_module_from_name(model, param_name)
         if tensor_name not in module._parameters and tensor_name not in module._buffers: