From a81d3b8b004520c38161cb8c28e031d21a6f4ec8 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 10 Mar 2025 14:52:53 +0530
Subject: [PATCH 1/3] memory usage tests

---
 tests/quantization/__init__.py             |  0
 tests/quantization/bnb/test_4bit.py        | 46 +++++++++++-----------
 tests/quantization/bnb/test_mixed_int8.py  | 43 ++++++++++----------
 tests/quantization/quanto/__init__.py      |  0
 tests/quantization/quanto/test_quanto.py   | 40 ++++---------------
 tests/quantization/torchao/__init__.py     |  0
 tests/quantization/torchao/test_torchao.py | 33 ++++++----------
 tests/quantization/utils.py                | 39 ++++++++++++++++++
 8 files changed, 102 insertions(+), 99 deletions(-)
 create mode 100644 tests/quantization/__init__.py
 create mode 100644 tests/quantization/quanto/__init__.py
 create mode 100644 tests/quantization/torchao/__init__.py
 create mode 100644 tests/quantization/utils.py

diff --git a/tests/quantization/__init__.py b/tests/quantization/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 6f85e6f38955..809c356c441a 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -54,29 +54,8 @@ def get_some_linear_layer(model):
 
 if is_torch_available():
     import torch
-    import torch.nn as nn
 
-    class LoRALayer(nn.Module):
-        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
-
-        Taken from
-        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_4bit.py#L62C5-L78C77
-        """
-
-        def __init__(self, module: nn.Module, rank: int):
-            super().__init__()
-            self.module = module
-            self.adapter = nn.Sequential(
-                nn.Linear(module.in_features, rank, bias=False),
-                nn.Linear(rank, module.out_features, bias=False),
-            )
-            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
-            nn.init.normal_(self.adapter[0].weight, std=small_std)
-            nn.init.zeros_(self.adapter[1].weight)
-            self.adapter.to(module.weight.device)
-
-        def forward(self, input, *args, **kwargs):
-            return self.module(input, *args, **kwargs) + self.adapter(input)
+    from ..utils import LoRALayer, get_memory_consumption_stat
 
 
 if is_bitsandbytes_available():
@@ -350,6 +329,29 @@ def test_bnb_4bit_errors_loading_incorrect_state_dict(self):
 
         assert key_to_target in str(err_context.exception)
 
+    def test_model_memory_usage(self):
+        # Delete to not let anything interfere.
+        del self.model_4bit, self.model_fp16
+
+        # Re-instantiate.
+        inputs = self.get_dummy_inputs()
+        model_fp16 = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", torch_dtype=torch.float16
+        )
+        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
+        nf4_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+        model_4bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", quantization_config=nf4_config, device_map=torch_device
+        )
+        quantized_model_memory = get_memory_consumption_stat(model_4bit, inputs)
+        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
+        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
+
+
 class BnB4BitTrainingTests(Base4bitTests):
     def setUp(self):
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index 4be420e7dffa..b8b4b1598eb7 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -60,29 +60,8 @@ def get_some_linear_layer(model):
 
 if is_torch_available():
     import torch
-    import torch.nn as nn
 
-    class LoRALayer(nn.Module):
-        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
-
-        Taken from
-        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_8bit.py#L62C5-L78C77
-        """
-
-        def __init__(self, module: nn.Module, rank: int):
-            super().__init__()
-            self.module = module
-            self.adapter = nn.Sequential(
-                nn.Linear(module.in_features, rank, bias=False),
-                nn.Linear(rank, module.out_features, bias=False),
-            )
-            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
-            nn.init.normal_(self.adapter[0].weight, std=small_std)
-            nn.init.zeros_(self.adapter[1].weight)
-            self.adapter.to(module.weight.device)
-
-        def forward(self, input, *args, **kwargs):
-            return self.module(input, *args, **kwargs) + self.adapter(input)
+    from ..utils import LoRALayer, get_memory_consumption_stat
 
 
 if is_bitsandbytes_available():
@@ -248,7 +227,7 @@ def test_llm_skip(self):
         self.assertTrue(linear.weight.dtype == torch.int8)
         self.assertTrue(isinstance(linear, bnb.nn.Linear8bitLt))
 
-        self.assertTrue(isinstance(model_8bit.proj_out, nn.Linear))
+        self.assertTrue(isinstance(model_8bit.proj_out, torch.nn.Linear))
         self.assertTrue(model_8bit.proj_out.weight.dtype != torch.int8)
 
     def test_config_from_pretrained(self):
@@ -308,6 +287,24 @@ def test_device_and_dtype_assignment(self):
         # Check that this does not throw an error
         _ = self.model_fp16.cuda()
 
+    def test_model_memory_usage(self):
+        # Delete to not let anything interfere.
+        del self.model_4bit, self.model_fp16
+
+        # Re-instantiate.
+        inputs = self.get_dummy_inputs()
+        model_fp16 = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", torch_dtype=torch.float16
+        )
+        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
+        config = BitsAndBytesConfig(load_in_8bit=True)
+        model_8bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", quantization_config=config, device_map=torch_device
+        )
+        quantized_model_memory = get_memory_consumption_stat(model_8bit, inputs)
+        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
+        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
+
 
 class Bnb8bitDeviceTests(Base8bitTests):
     def setUp(self) -> None:
diff --git a/tests/quantization/quanto/__init__.py b/tests/quantization/quanto/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py
index 89a56c15ed24..7072ddbcad10 100644
--- a/tests/quantization/quanto/test_quanto.py
+++ b/tests/quantization/quanto/test_quanto.py
@@ -19,29 +19,8 @@ if is_torch_available():
     import torch
-    import torch.nn as nn
-
-    class LoRALayer(nn.Module):
-        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
-
-        Taken from
-        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_4bit.py#L62C5-L78C77
-        """
-
-        def __init__(self, module: nn.Module, rank: int):
-            super().__init__()
-            self.module = module
-            self.adapter = nn.Sequential(
-                nn.Linear(module.in_features, rank, bias=False),
-                nn.Linear(rank, module.out_features, bias=False),
-            )
-            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
-            nn.init.normal_(self.adapter[0].weight, std=small_std)
-            nn.init.zeros_(self.adapter[1].weight)
-            self.adapter.to(module.weight.device)
-
-        def forward(self, input, *args, **kwargs):
-            return self.module(input, *args, **kwargs) + self.adapter(input)
+
+    from ..utils import LoRALayer, get_memory_consumption_stat
 
 
 @nightly
@@ -86,19 +65,14 @@ def test_quanto_layers(self):
 
     def test_quanto_memory_usage(self):
         unquantized_model = self.model_cls.from_pretrained(self.model_id, torch_dtype=self.torch_dtype)
-        unquantized_model_memory = unquantized_model.get_memory_footprint() / 1024**3
-
-        model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs())
         inputs = self.get_dummy_inputs()
+        unquantized_model_memory = get_memory_consumption_stat(unquantized_model, inputs)
 
-        torch.cuda.reset_peak_memory_stats()
-        torch.cuda.empty_cache()
+        quantized_model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs())
+        quantized_model_memory = get_memory_consumption_stat(quantized_model, inputs)
 
-        model.to(torch_device)
-        with torch.no_grad():
-            model(**inputs)
-        max_memory = torch.cuda.max_memory_allocated() / 1024**3
-        assert (1.0 - (max_memory / unquantized_model_memory)) >= self.expected_memory_reduction
+        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
+        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= self.expected_memory_reduction
 
     def test_keep_modules_in_fp32(self):
         r"""
diff --git a/tests/quantization/torchao/__init__.py b/tests/quantization/torchao/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index e14a1cc0369e..72a38ce9e7c3 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -50,27 +50,7 @@
     import torch
     import torch.nn as nn
 
-    class LoRALayer(nn.Module):
-        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
-
-        Taken from
-        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_4bit.py#L62C5-L78C77
-        """
-
-        def __init__(self, module: nn.Module, rank: int):
-            super().__init__()
-            self.module = module
-            self.adapter = nn.Sequential(
-                nn.Linear(module.in_features, rank, bias=False),
-                nn.Linear(rank, module.out_features, bias=False),
-            )
-            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
-            nn.init.normal_(self.adapter[0].weight, std=small_std)
-            nn.init.zeros_(self.adapter[1].weight)
-            self.adapter.to(module.weight.device)
-
-        def forward(self, input, *args, **kwargs):
-            return self.module(input, *args, **kwargs) + self.adapter(input)
+    from ..utils import LoRALayer, get_memory_consumption_stat
 
 
 if is_torchao_available():
@@ -503,6 +483,17 @@ def test_memory_footprint(self):
         # there is additional overhead of scales and zero points
         self.assertTrue(total_bf16 < total_int4wo)
 
+    def test_memory_usage(self):
+        model_id = "hf-internal-testing/tiny-flux-pipe"
+        inputs = self.get_dummy_inputs()
+        transformer_bf16 = self.get_dummy_components(None, model_id=model_id)["transformer"]
+        unquantized_model_memory = get_memory_consumption_stat(transformer_bf16, inputs)
+
+        transformer_int8wo = self.get_dummy_components(TorchAoConfig("int8wo"), model_id=model_id)["transformer"]
+        quantized_model_memory = get_memory_consumption_stat(transformer_int8wo, inputs)
+        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
+        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
+
     def test_wrong_config(self):
         with self.assertRaises(ValueError):
             self.get_dummy_components(TorchAoConfig("int42"))
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
new file mode 100644
index 000000000000..77fbf6463d8d
--- /dev/null
+++ b/tests/quantization/utils.py
@@ -0,0 +1,39 @@
+from diffusers.utils import is_torch_available
+
+
+if is_torch_available():
+    import torch
+    import torch.nn as nn
+
+    class LoRALayer(nn.Module):
+        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only
+
+        Taken from
+        https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_4bit.py#L62C5-L78C77
+        """
+
+        def __init__(self, module: nn.Module, rank: int):
+            super().__init__()
+            self.module = module
+            self.adapter = nn.Sequential(
+                nn.Linear(module.in_features, rank, bias=False),
+                nn.Linear(rank, module.out_features, bias=False),
+            )
+            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
+            nn.init.normal_(self.adapter[0].weight, std=small_std)
+            nn.init.zeros_(self.adapter[1].weight)
+            self.adapter.to(module.weight.device)
+
+        def forward(self, input, *args, **kwargs):
+            return self.module(input, *args, **kwargs) + self.adapter(input)
+
+
+    @torch.no_grad()
+    @torch.inference_mode()
+    def get_memory_consumption_stat(model, inputs):
+        torch.cuda.reset_peak_memory_stats()
+        torch.cuda.empty_cache()
+
+        model(**inputs)
+        max_memory_mem_allocated = torch.cuda.max_memory_allocated()
+        return max_memory_mem_allocated

From a1b70d1c8b34cc548fca4bc23044ed56a4d52e0b Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 10 Mar 2025 18:00:47 +0530
Subject: [PATCH 2/3] fixes

---
 .../quantizers/bitsandbytes/bnb_quantizer.py |  2 +
 .../quantizers/torchao/torchao_quantizer.py  |  1 +
 tests/quantization/bnb/test_4bit.py          | 57 +++++++++++--------
 tests/quantization/bnb/test_mixed_int8.py    | 48 +++++++++-------
 tests/quantization/quanto/test_quanto.py     | 17 ++++--
 tests/quantization/torchao/test_torchao.py   | 13 +++--
 tests/quantization/utils.py                  |  3 +-
 7 files changed, 84 insertions(+), 57 deletions(-)

diff --git a/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py b/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py
index ada75588a42a..f4aa1504534c 100644
--- a/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py
+++ b/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py
@@ -135,6 +135,7 @@ def create_quantized_param(
         target_device: "torch.device",
         state_dict: Dict[str, Any],
         unexpected_keys: Optional[List[str]] = None,
+        **kwargs,
     ):
         import bitsandbytes as bnb
 
@@ -445,6 +446,7 @@ def create_quantized_param(
         target_device: "torch.device",
         state_dict: Dict[str, Any],
         unexpected_keys: Optional[List[str]] = None,
+        **kwargs,
     ):
         import bitsandbytes as bnb
 
diff --git a/src/diffusers/quantizers/torchao/torchao_quantizer.py b/src/diffusers/quantizers/torchao/torchao_quantizer.py
index e86ce2f64278..03cb29c6f037 100644
--- a/src/diffusers/quantizers/torchao/torchao_quantizer.py
+++ b/src/diffusers/quantizers/torchao/torchao_quantizer.py
@@ -215,6 +215,7 @@ def create_quantized_param(
         target_device: "torch.device",
         state_dict: Dict[str, Any],
         unexpected_keys: List[str],
+        **kwargs,
     ):
         r"""
         Each nn.Linear layer that needs to be quantized is processsed here. First, we set the value the weight tensor,
diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index 809c356c441a..97047717cd83 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -75,6 +75,8 @@ class Base4bitTests(unittest.TestCase):
     # This was obtained on audace so the number might slightly change
     expected_rel_difference = 3.69
 
+    expected_memory_saving_ratio = 0.8
+
     prompt = "a beautiful sunset amidst the mountains."
     num_inference_steps = 10
     seed = 0
@@ -119,8 +121,10 @@ def setUp(self):
         )
 
     def tearDown(self):
-        del self.model_fp16
-        del self.model_4bit
+        if hasattr(self, "model_fp16"):
+            del self.model_fp16
+        if hasattr(self, "model_4bit"):
+            del self.model_4bit
 
         gc.collect()
         torch.cuda.empty_cache()
@@ -159,6 +163,32 @@ def test_memory_footprint(self):
         linear = get_some_linear_layer(self.model_4bit)
         self.assertTrue(linear.weight.__class__ == bnb.nn.Params4bit)
 
+    def test_model_memory_usage(self):
+        # Delete to not let anything interfere.
+        del self.model_4bit, self.model_fp16
+
+        # Re-instantiate.
+        inputs = self.get_dummy_inputs()
+        inputs = {
+            k: v.to(device=torch_device, dtype=torch.float16) for k, v in inputs.items() if not isinstance(v, bool)
+        }
+        model_fp16 = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", torch_dtype=torch.float16
+        ).to(torch_device)
+        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
+        del model_fp16
+
+        nf4_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+        model_4bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", quantization_config=nf4_config, torch_dtype=torch.float16
+        )
+        quantized_model_memory = get_memory_consumption_stat(model_4bit, inputs)
+        assert unquantized_model_memory / quantized_model_memory >= self.expected_memory_saving_ratio
+
     def test_original_dtype(self):
         r"""
         A simple test to check if the model succesfully stores the original dtype
@@ -329,29 +359,6 @@ def test_bnb_4bit_errors_loading_incorrect_state_dict(self):
 
         assert key_to_target in str(err_context.exception)
 
-
-    def test_model_memory_usage(self):
-        # Delete to not let anything interfere.
-        del self.model_4bit, self.model_fp16
-
-        # Re-instantiate.
-        inputs = self.get_dummy_inputs()
-        model_fp16 = SD3Transformer2DModel.from_pretrained(
-            self.model_name, subfolder="transformer", torch_dtype=torch.float16
-        )
-        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
-        nf4_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
-        )
-        model_4bit = SD3Transformer2DModel.from_pretrained(
-            self.model_name, subfolder="transformer", quantization_config=nf4_config, device_map=torch_device
-        )
-        quantized_model_memory = get_memory_consumption_stat(model_4bit, inputs)
-        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
-        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
-
-
 class BnB4BitTrainingTests(Base4bitTests):
     def setUp(self):
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index b8b4b1598eb7..4964f8c9af07 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -81,6 +81,8 @@ class Base8bitTests(unittest.TestCase):
     # This was obtained on audace so the number might slightly change
     expected_rel_difference = 1.94
 
+    expected_memory_saving_ratio = 0.7
+
     prompt = "a beautiful sunset amidst the mountains."
     num_inference_steps = 10
     seed = 0
@@ -121,8 +123,10 @@ def setUp(self):
         )
 
     def tearDown(self):
-        del self.model_fp16
-        del self.model_8bit
+        if hasattr(self, "model_fp16"):
+            del self.model_fp16
+        if hasattr(self, "model_8bit"):
+            del self.model_8bit
 
         gc.collect()
         torch.cuda.empty_cache()
@@ -161,6 +165,28 @@ def test_memory_footprint(self):
         linear = get_some_linear_layer(self.model_8bit)
         self.assertTrue(linear.weight.__class__ == bnb.nn.Int8Params)
 
+    def test_model_memory_usage(self):
+        # Delete to not let anything interfere.
+        del self.model_8bit, self.model_fp16
+
+        # Re-instantiate.
+        inputs = self.get_dummy_inputs()
+        inputs = {
+            k: v.to(device=torch_device, dtype=torch.float16) for k, v in inputs.items() if not isinstance(v, bool)
+        }
+        model_fp16 = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", torch_dtype=torch.float16
+        ).to(torch_device)
+        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
+        del model_fp16
+
+        config = BitsAndBytesConfig(load_in_8bit=True)
+        model_8bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name, subfolder="transformer", quantization_config=config, torch_dtype=torch.float16
+        )
+        quantized_model_memory = get_memory_consumption_stat(model_8bit, inputs)
+        assert unquantized_model_memory / quantized_model_memory >= self.expected_memory_saving_ratio
+
     def test_original_dtype(self):
         r"""
         A simple test to check if the model succesfully stores the original dtype
@@ -287,24 +313,6 @@ def test_device_and_dtype_assignment(self):
         # Check that this does not throw an error
         _ = self.model_fp16.cuda()
 
-    def test_model_memory_usage(self):
-        # Delete to not let anything interfere.
-        del self.model_4bit, self.model_fp16
-
-        # Re-instantiate.
-        inputs = self.get_dummy_inputs()
-        model_fp16 = SD3Transformer2DModel.from_pretrained(
-            self.model_name, subfolder="transformer", torch_dtype=torch.float16
-        )
-        unquantized_model_memory = get_memory_consumption_stat(model_fp16, inputs)
-        config = BitsAndBytesConfig(load_in_8bit=True)
-        model_8bit = SD3Transformer2DModel.from_pretrained(
-            self.model_name, subfolder="transformer", quantization_config=config, device_map=torch_device
-        )
-        quantized_model_memory = get_memory_consumption_stat(model_8bit, inputs)
-        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
-        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
-
 
 class Bnb8bitDeviceTests(Base8bitTests):
     def setUp(self) -> None:
diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py
index 7072ddbcad10..51ca0bfdc0ab 100644
--- a/tests/quantization/quanto/test_quanto.py
+++ b/tests/quantization/quanto/test_quanto.py
@@ -19,7 +19,7 @@ if is_torch_available():
     import torch
-
+
     from ..utils import LoRALayer, get_memory_consumption_stat
 
 
@@ -64,15 +64,20 @@ def test_quanto_layers(self):
                 assert isinstance(module, QLinear)
 
     def test_quanto_memory_usage(self):
-        unquantized_model = self.model_cls.from_pretrained(self.model_id, torch_dtype=self.torch_dtype)
         inputs = self.get_dummy_inputs()
+        inputs = {
+            k: v.to(device=torch_device, dtype=torch.bfloat16) for k, v in inputs.items() if not isinstance(v, bool)
+        }
+
+        unquantized_model = self.model_cls.from_pretrained(self.model_id, torch_dtype=self.torch_dtype)
+        unquantized_model.to(torch_device)
         unquantized_model_memory = get_memory_consumption_stat(unquantized_model, inputs)
 
         quantized_model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs())
+        quantized_model.to(torch_device)
         quantized_model_memory = get_memory_consumption_stat(quantized_model, inputs)
 
-        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
-        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= self.expected_memory_reduction
+        assert unquantized_model_memory / quantized_model_memory >= self.expected_memory_reduction
 
     def test_keep_modules_in_fp32(self):
         r"""
@@ -292,14 +297,14 @@ def test_training(self):
 
 
 class FluxTransformerFloat8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase):
-    expected_memory_reduction = 0.3
+    expected_memory_reduction = 0.6
 
     def get_dummy_init_kwargs(self):
         return {"weights_dtype": "float8"}
 
 
 class FluxTransformerInt8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase):
-    expected_memory_reduction = 0.3
+    expected_memory_reduction = 0.6
     _test_torch_compile = True
 
     def get_dummy_init_kwargs(self):
diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 72a38ce9e7c3..0e671307dd18 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -483,16 +483,21 @@ def test_memory_footprint(self):
         # there is additional overhead of scales and zero points
         self.assertTrue(total_bf16 < total_int4wo)
 
-    def test_memory_usage(self):
+    def test_model_memory_usage(self):
         model_id = "hf-internal-testing/tiny-flux-pipe"
-        inputs = self.get_dummy_inputs()
+        expected_memory_saving_ratio = 2.0
+
+        inputs = self.get_dummy_tensor_inputs(device=torch_device)
+
         transformer_bf16 = self.get_dummy_components(None, model_id=model_id)["transformer"]
+        transformer_bf16.to(torch_device)
         unquantized_model_memory = get_memory_consumption_stat(transformer_bf16, inputs)
+        del transformer_bf16
 
         transformer_int8wo = self.get_dummy_components(TorchAoConfig("int8wo"), model_id=model_id)["transformer"]
+        transformer_int8wo.to(torch_device)
         quantized_model_memory = get_memory_consumption_stat(transformer_int8wo, inputs)
-        print(f"{unquantized_model_memory=}, {quantized_model_memory=}")
-        assert (1.0 - (unquantized_model_memory / quantized_model_memory)) >= 100.
+        assert unquantized_model_memory / quantized_model_memory >= expected_memory_saving_ratio
 
     def test_wrong_config(self):
         with self.assertRaises(ValueError):
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 77fbf6463d8d..04ebf9e159f4 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -26,8 +26,7 @@ def __init__(self, module: nn.Module, rank: int):
 
         def forward(self, input, *args, **kwargs):
             return self.module(input, *args, **kwargs) + self.adapter(input)
-
-
+
     @torch.no_grad()
     @torch.inference_mode()
     def get_memory_consumption_stat(model, inputs):

From c0aac70ee1411224eafa968c6ee531b0964543c2 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Mon, 10 Mar 2025 18:04:39 +0530
Subject: [PATCH 3/3] gguf

---
 src/diffusers/quantizers/gguf/gguf_quantizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/diffusers/quantizers/gguf/gguf_quantizer.py b/src/diffusers/quantizers/gguf/gguf_quantizer.py
index 0c760e277ce4..6da69c7bd60c 100644
--- a/src/diffusers/quantizers/gguf/gguf_quantizer.py
+++ b/src/diffusers/quantizers/gguf/gguf_quantizer.py
@@ -108,6 +108,7 @@ def create_quantized_param(
         target_device: "torch.device",
         state_dict: Optional[Dict[str, Any]] = None,
         unexpected_keys: Optional[List[str]] = None,
+        **kwargs,
     ):
         module, tensor_name = get_module_from_name(model, param_name)
         if tensor_name not in module._parameters and tensor_name not in module._buffers: