From ff50418472b1477c9c6bf8224405835931ebafbd Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 5 Feb 2025 14:28:57 +0100 Subject: [PATCH 01/34] update --- src/diffusers/__init__.py | 23 +++++++++- src/diffusers/quantizers/auto.py | 4 ++ .../quantizers/quantization_config.py | 43 +++++++++++++++++++ src/diffusers/utils/__init__.py | 1 + src/diffusers/utils/import_utils.py | 19 ++++++++ 5 files changed, 88 insertions(+), 2 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index c36226225ad4..a099ccd5d539 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -11,6 +11,7 @@ is_librosa_available, is_note_seq_available, is_onnx_available, + is_optimum_quanto_available, is_scipy_available, is_sentencepiece_available, is_torch_available, @@ -32,7 +33,12 @@ "loaders": ["FromOriginalModelMixin"], "models": [], "pipelines": [], - "quantizers.quantization_config": ["BitsAndBytesConfig", "GGUFQuantizationConfig", "TorchAoConfig"], + "quantizers.quantization_config": [ + "BitsAndBytesConfig", + "GGUFQuantizationConfig", + "QuantoConfig", + "TorchAoConfig", + ], "schedulers": [], "utils": [ "OptionalDependencyNotAvailable", @@ -54,6 +60,19 @@ ], } +""" +try: + if not is_optimum_quanto_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_quanto_objects # noqa F403 + + _import_structure["utils.dummy_quanto_objects"] = [ + name for name in dir(dummy_quanto_objects) if not name.startswith("_") + ] +else: + _import_structure["quantizers.quantization_config"].extend("QuantoConfig") +""" try: if not is_onnx_available(): raise OptionalDependencyNotAvailable() @@ -581,7 +600,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: from .configuration_utils import ConfigMixin - from .quantizers.quantization_config import BitsAndBytesConfig, GGUFQuantizationConfig, TorchAoConfig + from .quantizers.quantization_config import BitsAndBytesConfig, GGUFQuantizationConfig, QuantoConfig, TorchAoConfig try: if not is_onnx_available(): diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index d9874cc282ae..e40ba51ccf14 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -21,7 +21,9 @@ from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer from .gguf import GGUFQuantizer +from .quanto import QuantoQuantizer from .quantization_config import ( + QuantoConfig, BitsAndBytesConfig, GGUFQuantizationConfig, QuantizationConfigMixin, @@ -36,6 +38,7 @@ "bitsandbytes_8bit": BnB8BitDiffusersQuantizer, "gguf": GGUFQuantizer, "torchao": TorchAoHfQuantizer, + "quanto": QuantoQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { @@ -43,6 +46,7 @@ "bitsandbytes_8bit": BitsAndBytesConfig, "gguf": GGUFQuantizationConfig, "torchao": TorchAoConfig, + "quanto": QuantoConfig, } diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index a6e4dd9ff5e5..3cd2193876ba 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -45,6 +45,7 @@ class QuantizationMethod(str, Enum): BITS_AND_BYTES = "bitsandbytes" GGUF = "gguf" TORCHAO = "torchao" + QUANTO = "quanto" @dataclass @@ -674,3 +675,45 @@ def __repr__(self): """ config_dict = self.to_dict() return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n" + + +@dataclass +class QuantoConfig(QuantizationConfigMixin): + """ + This is a wrapper class about all possible 
attributes and features that you can play with a model that has been + loaded using `quanto`. + + Args: + weights (`str`, *optional*, defaults to `"int8"`): + The target dtype for the weights after quantization. Supported values are ("float8","int8","int4","int2") + activations (`str`, *optional*): + The target dtype for the activations after quantization. Supported values are (None,"int8","float8") + modules_to_not_convert (`list`, *optional*, default to `None`): + The list of modules to not quantize, useful for quantizing models that explicitly require to have some + modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers). + """ + + def __init__( + self, + weights="int8", + activations=None, + modules_to_not_convert: Optional[List] = None, + **kwargs, + ): + self.quant_method = QuantizationMethod.QUANTO + self.weights = weights + self.activations = activations + self.modules_to_not_convert = modules_to_not_convert + self.post_init() + + def post_init(self): + r""" + Safety checker that arguments are correct + """ + accepted_weights = ["float8", "int8", "int4", "int2"] + accepted_activations = [None, "int8", "float8"] + if self.weights not in accepted_weights: + raise ValueError(f"Only support weights in {accepted_weights} but found {self.weights}") + + if self.activations not in accepted_activations: + raise ValueError(f"Only support weights in {accepted_activations} but found {self.activations}") diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index d82aded4c435..db50cfee6aba 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -79,6 +79,7 @@ is_matplotlib_available, is_note_seq_available, is_onnx_available, + is_optimum_quanto_available, is_peft_available, is_peft_version, is_safetensors_available, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 37535366ed44..ebf394623c6f 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -365,6 +365,15 @@ def is_timm_available(): _is_torchao_available = False +_is_optimum_quanto_available = importlib.util.find_spec("optimum") is not None +if _is_optimum_quanto_available: + try: + _optimum_quanto_version = importlib_metadata.version("optimum_quanto") + logger.debug(f"Successfully import optimum-quanto version {_optimum_quanto_version}") + except importlib_metadata.PackageNotFoundError: + _is_optimum_quanto_available = False + + def is_torch_available(): return _torch_available @@ -493,6 +502,10 @@ def is_torchao_available(): return _is_torchao_available +def is_optimum_quanto_available(): + return _is_optimum_quanto_available + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -636,6 +649,11 @@ def is_torchao_available(): torchao` """ +QUANTO_IMPORT_ERROR = """ +{0} requires the optimum-quanto library but it was not found in your environment. 
You can install it with pip: `pip +install optimum-quanto` +""" + BACKENDS_MAPPING = OrderedDict( [ ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), @@ -663,6 +681,7 @@ def is_torchao_available(): ("imageio", (is_imageio_available, IMAGEIO_IMPORT_ERROR)), ("gguf", (is_gguf_available, GGUF_IMPORT_ERROR)), ("torchao", (is_torchao_available, TORCHAO_IMPORT_ERROR)), + ("quanto", (is_optimum_quanto_available, QUANTO_IMPORT_ERROR)), ] ) From ba5bba74f2f6d5fb88b4fc17e3eac538b4fa5ad3 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 5 Feb 2025 14:29:11 +0100 Subject: [PATCH 02/34] updaet --- src/diffusers/quantizers/quanto/__init__.py | 1 + .../quantizers/quanto/quanto_quantizer.py | 136 ++++++++++++++ src/diffusers/quantizers/quanto/utils.py | 62 +++++++ tests/quantization/quanto/test_quanto.py | 174 ++++++++++++++++++ 4 files changed, 373 insertions(+) create mode 100644 src/diffusers/quantizers/quanto/__init__.py create mode 100644 src/diffusers/quantizers/quanto/quanto_quantizer.py create mode 100644 src/diffusers/quantizers/quanto/utils.py create mode 100644 tests/quantization/quanto/test_quanto.py diff --git a/src/diffusers/quantizers/quanto/__init__.py b/src/diffusers/quantizers/quanto/__init__.py new file mode 100644 index 000000000000..a4e8a1f41a1e --- /dev/null +++ b/src/diffusers/quantizers/quanto/__init__.py @@ -0,0 +1 @@ +from .quanto_quantizer import QuantoQuantizer diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py new file mode 100644 index 000000000000..ae58d0f6e5c4 --- /dev/null +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -0,0 +1,136 @@ +from typing import Any, Dict, List, Optional, Union + +import torch + +from ...utils import get_module_from_name, is_accelerate_available, is_accelerate_version, is_optimum_quanto_available +from ..base import DiffusersQuantizer + + +if is_accelerate_available(): + from accelerate.utils import CustomDtype, set_module_tensor_to_device + +if is_optimum_quanto_available(): + from .utils import _replace_with_quanto_layers + + +class QuantoQuantizer(DiffusersQuantizer): + r""" + Diffusers Quantizer for Optimum Quanto + """ + + requires_calibration = False + required_packages = ["quanto", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + + def validate_environment(self, *args, **kwargs): + if not is_optimum_quanto_available(): + raise ImportError( + "Loading an optimum-quanto quantized model requires optimum-quanto library (`pip install optimum-quanto`)" + ) + if not is_accelerate_available(): + raise ImportError( + "Loading an optimum-quanto quantized model requires accelerate library (`pip install accelerate`)" + ) + + def check_if_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ): + # Quanto imports diffusers internally. This is here to prevent circular imports + from optimum.quanto import QModuleMixin + + module, tensor_name = get_module_from_name(model, param_name) + if isinstance(module, QModuleMixin) and "weight" in tensor_name: + return not module.frozen + + return False + + def create_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + *args, + **kwargs, + ): + """ + Create the quantized parameter by calling .freeze() after setting it to the module. 
+ """ + + set_module_tensor_to_device(model, param_name, target_device, param_value) + module, _ = get_module_from_name(model, param_name) + module.freeze() + module.weight.requires_grad = False + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if is_accelerate_version(">=0.27.0"): + mapping = { + "int8": torch.int8, + "float8": CustomDtype.FP8, + "int4": CustomDtype.INT4, + "int2": CustomDtype.INT2, + } + target_dtype = mapping[self.quantization_config.weights] + return target_dtype + else: + raise ValueError( + "You are using `device_map='auto'` on an optimum-quanto quantized model. To automatically compute" + " the appropriate device map, you should upgrade your `accelerate` library," + "`pip install --upgrade accelerate` or install it from source." + ) + + def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: + # Quanto imports diffusers internally. This is here to prevent circular imports + from optimum.quanto import QModuleMixin + + not_missing_keys = [] + for name, module in model.named_modules(): + if isinstance(module, QModuleMixin): + for missing in missing_keys: + if ( + (name in missing or name in f"{prefix}.{missing}") + and not missing.endswith(".weight") + and not missing.endswith(".bias") + ): + not_missing_keys.append(missing) + return [k for k in missing_keys if k not in not_missing_keys] + + def _process_model_before_weight_loading( + self, + model: "ModelMixin", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + self.modules_to_not_convert = self.quantization_config.modules_to_not_convert + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + + model = _replace_with_quanto_layers( + model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config + ) + model.config.quantization_config = self.quantization_config + + def _process_model_after_weight_loading(self, model, **kwargs): + return model + + @property + def is_trainable(self, model: Optional["ModelMixin"] = None): + return True + + def is_serializable(self, safe_serialization=None): + return False diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py new file mode 100644 index 000000000000..f2f97a0c8806 --- /dev/null +++ b/src/diffusers/quantizers/quanto/utils.py @@ -0,0 +1,62 @@ +from typing import Optional + +import torch.nn as nn + +from ...utils import is_accelerate_available + + +if is_accelerate_available(): + from accelerate import init_empty_weights + + +def _replace_with_quanto_layers(model, quantization_config, modules_to_not_convert: list): + # Quanto imports diffusers internally. 
These are placed here to avoid circular imports + from optimum.quanto import QLayerNorm, QLinear, qfloat8, qint2, qint4, qint8 + + def _get_weight_type(dtype: str): + return {"float8": qfloat8, "int8": qint8, "int4": qint4, "int2": qint2}[dtype] + + def _get_activation_type(dtype: Optional[str]): + return {None: None, "float8": qfloat8, "int8": qint8}[dtype] + + def _replace_layers(model, quantization_config, modules_to_not_convert): + has_children = list(model.children()) + if not has_children: + return model + + for name, module in model.named_children(): + _replace_layers(module, quantization_config, modules_to_not_convert) + + if name in modules_to_not_convert: + continue + + if isinstance(module, nn.Linear): + with init_empty_weights(): + model._modules[name] = QLinear( + in_features=module.in_features, + out_features=module.out_features, + bias=module.bias is not None, + dtype=module.weight.dtype, + weights=_get_weight_type(quantization_config.weights), + activations=_get_activation_type(quantization_config.activations), + ) + model._modules[name].source_cls = type(module) + model._modules[name].requires_grad_(False) + + elif isinstance(module, nn.LayerNorm) and quantization_config.activations is not None: + with init_empty_weights(): + model._modules[name] = QLayerNorm( + module.normalized_shape, + module.eps, + module.elementwise_affine, + module.bias is not None, + activations=_get_activation_type(quantization_config.activations), + ) + model._modules[name].source_cls = type(module) + model._modules[name].requires_grad_(False) + + return model + + model = _replace_layers(model, quantization_config, modules_to_not_convert) + + return model diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py new file mode 100644 index 000000000000..8c095f98fb5b --- /dev/null +++ b/tests/quantization/quanto/test_quanto.py @@ -0,0 +1,174 @@ +import unittest + +import torch + +from diffusers import ( + QuantoConfig, +) +from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel +from diffusers.utils import is_optimum_quanto_available +from diffusers.utils.testing_utils import ( + torch_device, + nightly, + require_accelerate, + require_big_gpu_with_torch_cuda, +) + + +if is_optimum_quanto_available(): + from optimum.quanto import QLayerNorm, QLinear + + +@nightly +@require_big_gpu_with_torch_cuda +@require_accelerate +class QuantoBaseTesterMixin: + model_id = None + model_cls = None + torch_dtype = torch.bfloat16 + expected_memory_use_in_gb = 5 + + def get_dummy_init_kwargs(self): + return {"weights": "float8"} + + def get_dummy_model_init_kwargs(self): + return { + "pretrained_model_name_or_path": self.model_id, + "torch_dtype": self.torch_dtype, + "quantization_config": QuantoConfig(**self.get_dummy_init_kwargs()), + } + + def test_quanto_layers(self): + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + has_quantized_activations = model.hf_quantizer.quantization_config.activations is not None + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + assert isinstance(module, QLinear) + if isinstance(module, torch.nn.LayerNorm) and has_quantized_activations: + assert isinstance(module, QLayerNorm) + + def test_quanto_memory_usage(self): + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb + inputs = self.get_dummy_inputs() + + torch.cuda.reset_peak_memory_stats() + 
torch.cuda.empty_cache() + with torch.no_grad(): + model(**inputs) + max_memory = torch.cuda.max_memory_allocated() + assert (max_memory / 1024**3) < self.expected_memory_use_in_gb + + def test_keep_modules_in_fp32(self): + r""" + A simple tests to check if the modules under `_keep_in_fp32_modules` are kept in fp32. + Also ensures if inference works. + """ + _keep_in_fp32_modules = self.model_cls._keep_in_fp32_modules + self.model_cls._keep_in_fp32_modules = ["proj_out"] + + init_kwargs = self.get_dummy_init_kwargs() + quantization_config = QuantoConfig(**init_kwargs) + + model = self.model_cls.from_pretrained( + self.model_id, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + model.to("cuda") + + assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + if name in model._keep_in_fp32_modules: + assert module.weight.dtype == torch.float32 + self.model_cls._keep_in_fp32_modules = _keep_in_fp32_modules + + def test_dtype_assignment(self): + init_kwargs = self.get_dummy_init_kwargs() + quantization_config = QuantoConfig(**init_kwargs) + + model = self.model_cls.from_pretrained( + self.model_id, quantization_config=quantization_config, torch_dtype=self.torch_dtype + ) + assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb + + with self.assertRaises(ValueError): + # Tries with a `dtype` + model.to(torch.float16) + + with self.assertRaises(ValueError): + # Tries with a `device` and `dtype` + model.to(device="cuda:0", dtype=torch.float16) + + with self.assertRaises(ValueError): + # Tries with a cast + model.float() + + with self.assertRaises(ValueError): + # Tries with a cast + model.half() + + # This should work + model.to("cuda") + + +class FluxTransformerFloat8(QuantoBaseTesterMixin, unittest.TestCase): + model_id = "hf-internal-testing/tiny-flux-transformer" + model_cls = FluxTransformer2DModel + torch_dtype = torch.bfloat16 + expected_memory_use_in_gb = 10 + + def get_dummy_init_kwargs(self): + return {"weights": "float8", "activations": "float8"} + + def get_dummy_inputs(self): + return { + "hidden_states": torch.randn((1, 4096, 64), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "encoder_hidden_states": torch.randn( + (1, 512, 4096), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "pooled_projections": torch.randn( + (1, 768), + generator=torch.Generator("cpu").manual_seed(0), + ).to(torch_device, self.torch_dtype), + "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype), + "img_ids": torch.randn((4096, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "txt_ids": torch.randn((512, 3), generator=torch.Generator("cpu").manual_seed(0)).to( + torch_device, self.torch_dtype + ), + "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype), + } + + +class FluxTransformerInt8(QuantoBaseTesterMixin, unittest.TestCase): + model_id = "hf-internal-testing/tiny-flux-transformer" + model_cls = FluxTransformer2DModel + torch_dtype = torch.bfloat16 + expected_memory_use_in_gb = 10 + + def get_dummy_init_kwargs(self): + return {"weights": "int8"} + + +class FluxTransformerInt4(QuantoBaseTesterMixin, unittest.TestCase): + model_id = "black-forest-labs/FLUX.1-dev" + model_cls = FluxTransformer2DModel + torch_dtype = torch.bfloat16 + expected_memory_use_in_gb = 5 + + def 
get_dummy_init_kwargs(self): + return {"weights": "int4"} + + +class FluxTransformerInt2(QuantoBaseTesterMixin, unittest.TestCase): + model_id = "black-forest-labs/FLUX.1-dev" + model_cls = FluxTransformer2DModel + torch_dtype = torch.bfloat16 + expected_memory_use_in_gb = 5 + + def get_dummy_init_kwargs(self): + return {"weights": "int2"} From aa8cdaf05639f93fd7fa66af5b27202e89b3c179 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 5 Feb 2025 18:18:30 +0100 Subject: [PATCH 03/34] update --- src/diffusers/quantizers/quanto/utils.py | 12 ------------ tests/quantization/quanto/test_quanto.py | 3 --- 2 files changed, 15 deletions(-) diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py index f2f97a0c8806..e4a6d2c29a43 100644 --- a/src/diffusers/quantizers/quanto/utils.py +++ b/src/diffusers/quantizers/quanto/utils.py @@ -43,18 +43,6 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): model._modules[name].source_cls = type(module) model._modules[name].requires_grad_(False) - elif isinstance(module, nn.LayerNorm) and quantization_config.activations is not None: - with init_empty_weights(): - model._modules[name] = QLayerNorm( - module.normalized_shape, - module.eps, - module.elementwise_affine, - module.bias is not None, - activations=_get_activation_type(quantization_config.activations), - ) - model._modules[name].source_cls = type(module) - model._modules[name].requires_grad_(False) - return model model = _replace_layers(model, quantization_config, modules_to_not_convert) diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index 8c095f98fb5b..9cbe5e394df0 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -40,12 +40,9 @@ def get_dummy_model_init_kwargs(self): def test_quanto_layers(self): model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) - has_quantized_activations = model.hf_quantizer.quantization_config.activations is not None for name, module in model.named_modules(): if isinstance(module, torch.nn.Linear): assert isinstance(module, QLinear) - if isinstance(module, torch.nn.LayerNorm) and has_quantized_activations: - assert isinstance(module, QLayerNorm) def test_quanto_memory_usage(self): model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) From 39e20e240593443669a03e077e0429409189daa2 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Sat, 8 Feb 2025 10:51:57 +0100 Subject: [PATCH 04/34] update --- src/diffusers/__init__.py | 2 +- src/diffusers/models/model_loading_utils.py | 6 ++- src/diffusers/models/modeling_utils.py | 2 +- src/diffusers/quantizers/auto.py | 4 +- .../quantizers/quantization_config.py | 2 + .../quantizers/quanto/quanto_quantizer.py | 35 ++++++++++-- src/diffusers/quantizers/quanto/utils.py | 5 +- src/diffusers/utils/__init__.py | 1 + src/diffusers/utils/import_utils.py | 15 ++++++ tests/quantization/quanto/test_quanto.py | 54 +++++++------------ 10 files changed, 81 insertions(+), 45 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a099ccd5d539..67960e6cf100 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -65,7 +65,7 @@ if not is_optimum_quanto_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import dummy_quanto_objects # noqa F403 + from .utils import dummy_quanto_objects # noqa F403 _import_structure["utils.dummy_quanto_objects"] = [ 
name for name in dir(dummy_quanto_objects) if not name.startswith("_") diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 7e7445ef1239..969cf16b3c5e 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -220,7 +220,7 @@ def load_model_dict_into_meta( and any( module_to_keep_in_fp32 in param_name.split(".") for module_to_keep_in_fp32 in keep_in_fp32_modules ) - and dtype == torch.float16 + and dtype in [torch.float16, torch.bfloat16] ): param = param.to(torch.float32) if accepts_dtype: @@ -248,7 +248,9 @@ def load_model_dict_into_meta( if is_quantized and ( hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=device) ): - hf_quantizer.create_quantized_param(model, param, param_name, device, state_dict, unexpected_keys) + hf_quantizer.create_quantized_param( + model, param, param_name, device, state_dict, unexpected_keys, dtype=dtype + ) else: if accepts_dtype: set_module_tensor_to_device(model, param_name, device, value=param, **set_module_kwargs) diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 3ef40ffb5783..012c080d7581 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -866,7 +866,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # Check if `_keep_in_fp32_modules` is not None use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( - (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") + (torch_dtype in [torch.float16, torch.bfloat16]) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") ) if use_keep_in_fp32_modules: keep_in_fp32_modules = cls._keep_in_fp32_modules diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index e40ba51ccf14..687b29470072 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -21,15 +21,15 @@ from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer from .gguf import GGUFQuantizer -from .quanto import QuantoQuantizer from .quantization_config import ( - QuantoConfig, BitsAndBytesConfig, GGUFQuantizationConfig, QuantizationConfigMixin, QuantizationMethod, + QuantoConfig, TorchAoConfig, ) +from .quanto import QuantoQuantizer from .torchao import TorchAoHfQuantizer diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 3cd2193876ba..ff7dd1e15417 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -698,12 +698,14 @@ def __init__( weights="int8", activations=None, modules_to_not_convert: Optional[List] = None, + compute_dtype: Optional["torch.dtype"] = None, **kwargs, ): self.quant_method = QuantizationMethod.QUANTO self.weights = weights self.activations = activations self.modules_to_not_convert = modules_to_not_convert + self.post_init() def post_init(self): diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index ae58d0f6e5c4..5c54092a9d27 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -1,23 +1,37 @@ -from typing import Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import torch -from ...utils import get_module_from_name, 
is_accelerate_available, is_accelerate_version, is_optimum_quanto_available +from ...utils import ( + get_module_from_name, + is_accelerate_available, + is_accelerate_version, + is_optimum_quanto_available, + is_optimum_quanto_version, + logging, +) from ..base import DiffusersQuantizer +if TYPE_CHECKING: + from ...models.modeling_utils import ModelMixin + + if is_accelerate_available(): from accelerate.utils import CustomDtype, set_module_tensor_to_device if is_optimum_quanto_available(): from .utils import _replace_with_quanto_layers +logger = logging.get_logger(__name__) + class QuantoQuantizer(DiffusersQuantizer): r""" Diffusers Quantizer for Optimum Quanto """ + use_keep_in_fp32_modules = True requires_calibration = False required_packages = ["quanto", "accelerate"] @@ -29,6 +43,10 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Loading an optimum-quanto quantized model requires optimum-quanto library (`pip install optimum-quanto`)" ) + if not is_optimum_quanto_version(">=", "0.2.6"): + raise RuntimeError( + "The minimum required version of `optimum-quanto` is 0.2.6. Please upgrade with `pip install -U optimum-quanto`." + ) if not is_accelerate_available(): raise ImportError( "Loading an optimum-quanto quantized model requires accelerate library (`pip install accelerate`)" @@ -63,8 +81,9 @@ def create_quantized_param( """ Create the quantized parameter by calling .freeze() after setting it to the module. """ + dtype = kwargs.get("dtype", torch.float32) - set_module_tensor_to_device(model, param_name, target_device, param_value) + set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) module, _ = get_module_from_name(model, param_name) module.freeze() module.weight.requires_grad = False @@ -90,6 +109,12 @@ def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": "`pip install --upgrade accelerate` or install it from source." ) + def update_torch_dtype(self, torch_dtype: "torch.dtype" = None) -> "torch.dtype": + if torch_dtype is None: + logger.info("You did not specify `torch_dtype` in `from_pretrained`. Setting it to `torch.float32`.") + torch_dtype = torch.float32 + return torch_dtype + def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: # Quanto imports diffusers internally. This is here to prevent circular imports from optimum.quanto import QModuleMixin @@ -128,6 +153,10 @@ def _process_model_before_weight_loading( def _process_model_after_weight_loading(self, model, **kwargs): return model + def _dequantize(self, model): + logger.warning("Dequantizing the full model is currently not supported with the Quanto backend") + return + @property def is_trainable(self, model: Optional["ModelMixin"] = None): return True diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py index e4a6d2c29a43..9367a9306671 100644 --- a/src/diffusers/quantizers/quanto/utils.py +++ b/src/diffusers/quantizers/quanto/utils.py @@ -11,7 +11,7 @@ def _replace_with_quanto_layers(model, quantization_config, modules_to_not_convert: list): # Quanto imports diffusers internally. 
These are placed here to avoid circular imports - from optimum.quanto import QLayerNorm, QLinear, qfloat8, qint2, qint4, qint8 + from optimum.quanto import QLinear, qfloat8, qint2, qint4, qint8 def _get_weight_type(dtype: str): return {"float8": qfloat8, "int8": qint8, "int4": qint4, "int2": qint2}[dtype] @@ -32,7 +32,7 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): if isinstance(module, nn.Linear): with init_empty_weights(): - model._modules[name] = QLinear( + qlinear = QLinear( in_features=module.in_features, out_features=module.out_features, bias=module.bias is not None, @@ -40,6 +40,7 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): weights=_get_weight_type(quantization_config.weights), activations=_get_activation_type(quantization_config.activations), ) + model._modules[name] = qlinear model._modules[name].source_cls = type(module) model._modules[name].requires_grad_(False) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index db50cfee6aba..32b766875d07 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -80,6 +80,7 @@ is_note_seq_available, is_onnx_available, is_optimum_quanto_available, + is_optimum_quanto_version, is_peft_available, is_peft_version, is_safetensors_available, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index ebf394623c6f..329cdcb21d3a 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -883,6 +883,21 @@ def is_k_diffusion_version(operation: str, version: str): return compare_versions(parse(_k_diffusion_version), operation, version) +def is_optimum_quanto_version(operation: str, version: str): + """ + Compares the current Accelerate version to a given reference with an operation. + + Args: + operation (`str`): + A string representation of an operator, such as `">"` or `"<="` + version (`str`): + A version string + """ + if not _is_optimum_quanto_available: + return False + return compare_versions(parse(_optimum_quanto_version), operation, version) + + def get_objects_from_module(module): """ Returns a dict of object names and values in a module, while skipping private/internal objects diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index 9cbe5e394df0..213c8f44fdb1 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -8,15 +8,15 @@ from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel from diffusers.utils import is_optimum_quanto_available from diffusers.utils.testing_utils import ( - torch_device, nightly, require_accelerate, require_big_gpu_with_torch_cuda, + torch_device, ) if is_optimum_quanto_available(): - from optimum.quanto import QLayerNorm, QLinear + from optimum.quanto import QLinear @nightly @@ -27,6 +27,7 @@ class QuantoBaseTesterMixin: model_cls = None torch_dtype = torch.bfloat16 expected_memory_use_in_gb = 5 + keep_in_fp32_module = "" def get_dummy_init_kwargs(self): return {"weights": "float8"} @@ -62,14 +63,9 @@ def test_keep_modules_in_fp32(self): Also ensures if inference works. 
""" _keep_in_fp32_modules = self.model_cls._keep_in_fp32_modules - self.model_cls._keep_in_fp32_modules = ["proj_out"] - - init_kwargs = self.get_dummy_init_kwargs() - quantization_config = QuantoConfig(**init_kwargs) + self.model_cls._keep_in_fp32_modules = self.keep_in_fp32_module - model = self.model_cls.from_pretrained( - self.model_id, quantization_config=quantization_config, torch_dtype=self.torch_dtype - ) + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) model.to("cuda") assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb @@ -80,12 +76,7 @@ def test_keep_modules_in_fp32(self): self.model_cls._keep_in_fp32_modules = _keep_in_fp32_modules def test_dtype_assignment(self): - init_kwargs = self.get_dummy_init_kwargs() - quantization_config = QuantoConfig(**init_kwargs) - - model = self.model_cls.from_pretrained( - self.model_id, quantization_config=quantization_config, torch_dtype=self.torch_dtype - ) + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb with self.assertRaises(ValueError): @@ -108,14 +99,11 @@ def test_dtype_assignment(self): model.to("cuda") -class FluxTransformerFloat8(QuantoBaseTesterMixin, unittest.TestCase): +class FluxTransformerQuantoMixin(QuantoBaseTesterMixin): model_id = "hf-internal-testing/tiny-flux-transformer" model_cls = FluxTransformer2DModel torch_dtype = torch.bfloat16 - expected_memory_use_in_gb = 10 - - def get_dummy_init_kwargs(self): - return {"weights": "float8", "activations": "float8"} + keep_in_fp32_module = "proj_out" def get_dummy_inputs(self): return { @@ -141,31 +129,29 @@ def get_dummy_inputs(self): } -class FluxTransformerInt8(QuantoBaseTesterMixin, unittest.TestCase): - model_id = "hf-internal-testing/tiny-flux-transformer" - model_cls = FluxTransformer2DModel - torch_dtype = torch.bfloat16 +class FluxTransformerFloat8(FluxTransformerQuantoMixin, unittest.TestCase): + expected_memory_use_in_gb = 10 + + def get_dummy_init_kwargs(self): + return {"weights": "float8"} + + +class FluxTransformerInt8(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 10 def get_dummy_init_kwargs(self): return {"weights": "int8"} -class FluxTransformerInt4(QuantoBaseTesterMixin, unittest.TestCase): - model_id = "black-forest-labs/FLUX.1-dev" - model_cls = FluxTransformer2DModel - torch_dtype = torch.bfloat16 - expected_memory_use_in_gb = 5 +class FluxTransformerInt4(FluxTransformerQuantoMixin, unittest.TestCase): + expected_memory_use_in_gb = 6 def get_dummy_init_kwargs(self): return {"weights": "int4"} -class FluxTransformerInt2(QuantoBaseTesterMixin, unittest.TestCase): - model_id = "black-forest-labs/FLUX.1-dev" - model_cls = FluxTransformer2DModel - torch_dtype = torch.bfloat16 - expected_memory_use_in_gb = 5 +class FluxTransformerInt2(FluxTransformerQuantoMixin, unittest.TestCase): + expected_memory_use_in_gb = 6 def get_dummy_init_kwargs(self): return {"weights": "int2"} From f52050a39f0ed5b58659c84279010a4028ada11a Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Sat, 8 Feb 2025 10:52:12 +0100 Subject: [PATCH 05/34] update --- docs/source/en/quantization/quanto.md | 87 +++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 docs/source/en/quantization/quanto.md diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md new file mode 100644 index 000000000000..e31cb1bf9998 --- /dev/null +++ 
b/docs/source/en/quantization/quanto.md @@ -0,0 +1,87 @@ + + +# Quanto + +[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend for [Optimum.](https://huggingface.co/docs/optimum/en/index) +It has been designed with versatility and simplicity in mind: + +- All features are available in eager mode (works with non-traceable models) +- Supports quantization aware training +- Quantized models are compatible with `torch.compile` +- Quantized models are Device agnostic (e.g CUDA,XPU,MPS,CPU) + +In order to use the Quanto backend, you will first need to install `optimum-quanto>=0.2.6` and `accelerate` + +```shell +pip install optimum-quanto accelerate +``` + +Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. The following snippet demonstrates how to apply `float8` quantization with Quanto. + +```python +import torch +from diffusers import FluxTransformer2DModel, QuantoConfig + +model_id = "black-forest-labs/FLUX.1-dev" +quantization_config = QuantoConfig(weights="float8") +transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) + +pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype) +pipe.to("cuda") + +prompt = "A cat holding a sign that says hello world" +image = pipe( + prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512 +).images[0] +image.save("output.png") +``` +## Saving Quantized models + +Diffusers supports serializing and saving Quanto models using the `save_pretrained` method. +```python + +import torch +from diffusers import FluxTransformer2DModel, QuantoConfig + +model_id = "black-forest-labs/FLUX.1-dev" +quantization_config = QuantoConfig(weights="float8") +transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) + +# save quantized model to reuse +transformer.save_pretrained("") + +## Supported Quantization Types + +### Weights + +- float8 +- int8 +- int4 +- int2 + +### Activations +- float8 +- int8 + + +``` +``` +``` + + +``` + + + + From f4c14c222de86e787a3ce0f32b8b7b9eb59a0f3e Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 10 Feb 2025 08:20:14 +0100 Subject: [PATCH 06/34] update --- docs/source/en/quantization/quanto.md | 61 ++++++++---- src/diffusers/__init__.py | 99 +++++++++++++++++-- src/diffusers/models/modeling_utils.py | 1 - .../utils/dummy_bitsandbytes_objects.py | 17 ++++ src/diffusers/utils/dummy_gguf_objects.py | 17 ++++ .../utils/dummy_optimum_quanto_objects.py | 17 ++++ src/diffusers/utils/dummy_torchao_objects.py | 17 ++++ tests/quantization/quanto/test_quanto.py | 20 ++++ 8 files changed, 221 insertions(+), 28 deletions(-) create mode 100644 src/diffusers/utils/dummy_bitsandbytes_objects.py create mode 100644 src/diffusers/utils/dummy_gguf_objects.py create mode 100644 src/diffusers/utils/dummy_optimum_quanto_objects.py create mode 100644 src/diffusers/utils/dummy_torchao_objects.py diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index e31cb1bf9998..0de767148abd 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. 
# Quanto -[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend for [Optimum.](https://huggingface.co/docs/optimum/en/index) +[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend for [Optimum.](https://huggingface.co/docs/optimum/en/index) It has been designed with versatility and simplicity in mind: - All features are available in eager mode (works with non-traceable models) @@ -27,10 +27,10 @@ In order to use the Quanto backend, you will first need to install `optimum-quan pip install optimum-quanto accelerate ``` -Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. The following snippet demonstrates how to apply `float8` quantization with Quanto. +Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. The following snippet demonstrates how to apply `float8` quantization with Quanto. ```python -import torch +import torch from diffusers import FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" @@ -46,12 +46,24 @@ image = pipe( ).images[0] image.save("output.png") ``` -## Saving Quantized models -Diffusers supports serializing and saving Quanto models using the `save_pretrained` method. +## Using `from_single_file` with the Quanto Backend + ```python +import torch +from diffusers import FluxTransformer2DModel, QuantoConfig + +ckpt_path = "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors" +quantization_config = QuantoConfig(weights="float8") +transformer = FluxTransformer2DModel.from_single_file(ckpt_path, quantization_config=quantization_config, torch_dtype=torch.bfloat16) +``` + +## Saving Quantized models + +Diffusers supports serializing and saving Quanto models using the `save_pretrained` method. -import torch +```python +import torch from diffusers import FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" @@ -59,11 +71,32 @@ quantization_config = QuantoConfig(weights="float8") transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) # save quantized model to reuse -transformer.save_pretrained("") +transformer.save_pretrained("") + +# you can reload your quantized model with +model = FluxTransformer2DModel.from_pretrained("") +``` + +## Using `torch.compile` with Quanto + +Currently the Quanto backend only supports `torch.compile` for `int8` weights and activations. 
+ +```python +import torch +from diffusers import FluxTransformer2DModel, QuantoConfig + +model_id = "black-forest-labs/FLUX.1-dev" +quantization_config = QuantoConfig(weights="int8") +transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) +transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True) + +pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype) +pipe.to("cuda") +``` ## Supported Quantization Types -### Weights +### Weights - float8 - int8 @@ -73,15 +106,3 @@ transformer.save_pretrained("") ### Activations - float8 - int8 - - -``` -``` -``` - - -``` - - - - diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 67960e6cf100..c22768a894e3 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -2,6 +2,15 @@ from typing import TYPE_CHECKING +from diffusers.quantizers import quantization_config +from diffusers.utils import dummy_gguf_objects +from diffusers.utils.import_utils import ( + is_bitsandbytes_available, + is_gguf_available, + is_optimum_quanto_version, + is_torchao_available, +) + from .utils import ( DIFFUSERS_SLOW_IMPORT, OptionalDependencyNotAvailable, @@ -33,12 +42,7 @@ "loaders": ["FromOriginalModelMixin"], "models": [], "pipelines": [], - "quantizers.quantization_config": [ - "BitsAndBytesConfig", - "GGUFQuantizationConfig", - "QuantoConfig", - "TorchAoConfig", - ], + "quantizers.quantization_config": [], "schedulers": [], "utils": [ "OptionalDependencyNotAvailable", @@ -73,6 +77,56 @@ else: _import_structure["quantizers.quantization_config"].extend("QuantoConfig") """ + +try: + if not is_bitsandbytes_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_bitsandbytes_objects + + _import_structure["utils.dummy_bitsandbytes_objects"] = [ + name for name in dir(dummy_bitsandbytes_objects) if not name.startswith("_") + ] +else: + _import_structure["quantizers.quantization_config"].append("BitsAndBytesConfig") + +try: + if not is_gguf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_gguf_objects + + _import_structure["utils.dummy_gguf_objects"] = [ + name for name in dir(dummy_gguf_objects) if not name.startswith("_") + ] +else: + _import_structure["quantizers.quantization_config"].append("GGUFQuantizationConfig") + +try: + if not is_torchao_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torchao_objects + + _import_structure["utils.dummy_torchao_bjects"] = [ + name for name in dir(dummy_torchao_objects) if not name.startswith("_") + ] +else: + _import_structure["quantizers.quantization_config"].append("TorchAoConfig") + +try: + if not is_optimum_quanto_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from utils import dummy_optimum_quanto_objects + + _import_structure["utils.dummy_optimum_quanto_objects"] = [ + name for name in dir(dummy_optimum_quanto_objects) if not name.startswith("_") + ] +else: + _import_structure["quantizers.quantization_config"].append("QuantoConfig") + + try: if not is_onnx_available(): raise OptionalDependencyNotAvailable() @@ -600,7 +654,38 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: from .configuration_utils import ConfigMixin - from .quantizers.quantization_config import BitsAndBytesConfig, GGUFQuantizationConfig, 
QuantoConfig, TorchAoConfig + + try: + if not is_bitsandbytes_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_bitsandbytes_objects import * + else: + from .quantizers.quantization_config import BitsAndBytesConfig + + try: + if not is_gguf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_gguf_objects import * + else: + from .quantizers.quantization_config import GGUFQuantizationConfig + + try: + if not is_torchao_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torchao_objects import * + else: + from .quantizers.quantization_config import TorchAoConfig + + try: + if not is_optimum_quanto_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_optimum_quanto_objects import * + else: + from .quantizers.quantization_config import QuantoConfig try: if not is_onnx_available(): diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 012c080d7581..71b4b889a861 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -1041,7 +1041,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P model, state_dict, device=param_device, - dtype=torch_dtype, model_name_or_path=pretrained_model_name_or_path, hf_quantizer=hf_quantizer, keep_in_fp32_modules=keep_in_fp32_modules, diff --git a/src/diffusers/utils/dummy_bitsandbytes_objects.py b/src/diffusers/utils/dummy_bitsandbytes_objects.py new file mode 100644 index 000000000000..2dc589428de9 --- /dev/null +++ b/src/diffusers/utils/dummy_bitsandbytes_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class BitsAndBytesConfig(metaclass=DummyObject): + _backends = ["bitsandbytes"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["bitsandbytes"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["bitsandbytes"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["bitsandbytes"]) diff --git a/src/diffusers/utils/dummy_gguf_objects.py b/src/diffusers/utils/dummy_gguf_objects.py new file mode 100644 index 000000000000..4a6d9a060a13 --- /dev/null +++ b/src/diffusers/utils/dummy_gguf_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class GGUFQuantizationConfig(metaclass=DummyObject): + _backends = ["gguf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["gguf"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["gguf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["gguf"]) diff --git a/src/diffusers/utils/dummy_optimum_quanto_objects.py b/src/diffusers/utils/dummy_optimum_quanto_objects.py new file mode 100644 index 000000000000..44f8eaffc246 --- /dev/null +++ b/src/diffusers/utils/dummy_optimum_quanto_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
+from ..utils import DummyObject, requires_backends + + +class QuantoConfig(metaclass=DummyObject): + _backends = ["optimum_quanto"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["optimum_quanto"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["optimum_quanto"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["optimum_quanto"]) diff --git a/src/diffusers/utils/dummy_torchao_objects.py b/src/diffusers/utils/dummy_torchao_objects.py new file mode 100644 index 000000000000..16f0f6a55f64 --- /dev/null +++ b/src/diffusers/utils/dummy_torchao_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class TorchAoConfig(metaclass=DummyObject): + _backends = ["torchao"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torchao"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torchao"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torchao"]) diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index 213c8f44fdb1..1132e610736f 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -9,6 +9,7 @@ from diffusers.utils import is_optimum_quanto_available from diffusers.utils.testing_utils import ( nightly, + numpy_cosine_similarity_distance, require_accelerate, require_big_gpu_with_torch_cuda, torch_device, @@ -142,6 +143,25 @@ class FluxTransformerInt8(FluxTransformerQuantoMixin, unittest.TestCase): def get_dummy_init_kwargs(self): return {"weights": "int8"} + def test_torch_compile(self): + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + compiled_model = torch.compile(model, mode="max-autotune", fullgraph=True) + inputs = self.get_dummy_inputs() + + model.to(torch_device) + with torch.no_grad(): + model_output = model(**inputs).sample + model.to("cpu") + + compiled_model.to(torch_device) + with torch.no_grad(): + compiled_model_output = compiled_model(**inputs).sample + + max_diff = numpy_cosine_similarity_distance( + model_output.cpu().flatten(), compiled_model_output.cpu().flatten() + ) + assert max_diff < 1e-4 + class FluxTransformerInt4(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 6 From f67d97c0ac865e105b1845d662520ce68f1e8658 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 10 Feb 2025 08:22:27 +0100 Subject: [PATCH 07/34] update --- src/diffusers/__init__.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index c22768a894e3..b3bcbfaea1ca 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -64,20 +64,6 @@ ], } -""" -try: - if not is_optimum_quanto_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils import dummy_quanto_objects # noqa F403 - - _import_structure["utils.dummy_quanto_objects"] = [ - name for name in dir(dummy_quanto_objects) if not name.startswith("_") - ] -else: - _import_structure["quantizers.quantization_config"].extend("QuantoConfig") -""" - try: if not is_bitsandbytes_available(): raise OptionalDependencyNotAvailable() From 5cff237f7548966a6d11ada9151c61f16ae10c93 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 10 Feb 2025 08:49:48 +0100 Subject: [PATCH 08/34] update --- 
src/diffusers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index b3bcbfaea1ca..b94c7f89656c 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -104,7 +104,7 @@ if not is_optimum_quanto_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from utils import dummy_optimum_quanto_objects + from .utils import dummy_optimum_quanto_objects _import_structure["utils.dummy_optimum_quanto_objects"] = [ name for name in dir(dummy_optimum_quanto_objects) if not name.startswith("_") From f734c096e7d21fdc9311dfeaa5ffd5339644affe Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 10 Feb 2025 09:00:56 +0100 Subject: [PATCH 09/34] update --- src/diffusers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index b94c7f89656c..f28bc13c0efd 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -94,7 +94,7 @@ except OptionalDependencyNotAvailable: from .utils import dummy_torchao_objects - _import_structure["utils.dummy_torchao_bjects"] = [ + _import_structure["utils.dummy_torchao_objects"] = [ name for name in dir(dummy_torchao_objects) if not name.startswith("_") ] else: From 7472f18b9c0cfca91983506aff4a39c91cded9af Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 10 Feb 2025 09:15:09 +0100 Subject: [PATCH 10/34] update --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/quantization/overview.md | 1 + 2 files changed, 3 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 752219b4abd1..9de7c566d10b 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -163,6 +163,8 @@ title: gguf - local: quantization/torchao title: torchao + - local: quantization/quanto + title: quanto title: Quantization Methods - sections: - local: optimization/fp16 diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 794098e210a6..93323f86c7fc 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -36,5 +36,6 @@ Diffusers currently supports the following quantization methods. - [BitsandBytes](./bitsandbytes) - [TorchAO](./torchao) - [GGUF](./gguf) +- [Quanto](./quanto.md) [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques. 
From e96686e9c9e3cd3e0a378e5f09f340b76e10cde2 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 10 Feb 2025 09:34:37 +0100 Subject: [PATCH 11/34] update --- setup.py | 4 ++++ src/diffusers/dependency_versions_table.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/setup.py b/setup.py index 0acdcbbb9c52..f2eabe732c8e 100644 --- a/setup.py +++ b/setup.py @@ -127,6 +127,10 @@ "GitPython<3.1.19", "scipy", "onnx", + "optimum_quanto>=0.2.6", + "gguf>=0.10.0", + "torchao>=0.7.0", + "bitsandbytes>=0.43.3", "regex!=2019.12.17", "requests", "tensorboard", diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 7999368f1417..63df191f852a 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -35,6 +35,10 @@ "GitPython": "GitPython<3.1.19", "scipy": "scipy", "onnx": "onnx", + "optimum_quanto": "optimum_quanto>=0.2.6", + "gguf": "gguf>=0.10.0", + "torchao": "torchao>=0.7.0", + "bitsandbytes": "bitsandbytes>=0.43.3", "regex": "regex!=2019.12.17", "requests": "requests", "tensorboard": "tensorboard", From 4ae86916b2d06a67251b3df6ce1482302fd1fcaa Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 10 Feb 2025 09:46:40 +0100 Subject: [PATCH 12/34] update --- src/diffusers/quantizers/quanto/quanto_quantizer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index 5c54092a9d27..b22832f85cb9 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -1,13 +1,12 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union -import torch - from ...utils import ( get_module_from_name, is_accelerate_available, is_accelerate_version, is_optimum_quanto_available, is_optimum_quanto_version, + is_torch_available, logging, ) from ..base import DiffusersQuantizer @@ -17,6 +16,9 @@ from ...models.modeling_utils import ModelMixin +if is_torch_available(): + import torch + if is_accelerate_available(): from accelerate.utils import CustomDtype, set_module_tensor_to_device From 7b841dc52d536b11724c4ed4848d39c3e919f3dd Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 11 Feb 2025 11:28:10 +0530 Subject: [PATCH 13/34] Update docs/source/en/quantization/quanto.md Co-authored-by: Sayak Paul --- docs/source/en/quantization/quanto.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index 0de767148abd..3e6baaefbe95 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -13,8 +13,7 @@ specific language governing permissions and limitations under the License. # Quanto -[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend for [Optimum.](https://huggingface.co/docs/optimum/en/index) -It has been designed with versatility and simplicity in mind: +[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend for [Optimum](https://huggingface.co/docs/optimum/en/index). 
It has been designed with versatility and simplicity in mind: - All features are available in eager mode (works with non-traceable models) - Supports quantization aware training From e0901777665daee9966a2beb3b0e58a992336c22 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 11 Feb 2025 12:34:13 +0100 Subject: [PATCH 14/34] update --- docs/source/en/quantization/quanto.md | 13 ++++ src/diffusers/models/modeling_utils.py | 2 +- src/diffusers/quantizers/quanto/utils.py | 12 +++- tests/quantization/quanto/test_quanto.py | 85 +++++++++++++++++++++--- 4 files changed, 102 insertions(+), 10 deletions(-) diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index 0de767148abd..7780a45dd1cd 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -47,6 +47,19 @@ image = pipe( image.save("output.png") ``` +## Skipping Quantization on specific modules + +It is possible to skip applying quantization on certain modules using the `modules_to_not_convert` argument in the `QuantoConfig`. Please ensure that the modules passed in to this argument match the keys of the modules in the `state_dict` + +```python +import torch +from diffusers import FluxTransformer2DModel, QuantoConfig + +model_id = "black-forest-labs/FLUX.1-dev" +quantization_config = QuantoConfig(weights="float8", modules_to_not_convert=["proj_out"]) +transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) +``` + ## Using `from_single_file` with the Quanto Backend ```python diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 71b4b889a861..effded0fe208 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -1036,11 +1036,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ) named_buffers = model.named_buffers() - unexpected_keys = load_model_dict_into_meta( model, state_dict, device=param_device, + dtype=torch_dtype, model_name_or_path=pretrained_model_name_or_path, hf_quantizer=hf_quantizer, keep_in_fp32_modules=keep_in_fp32_modules, diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py index 9367a9306671..c71220aa57f0 100644 --- a/src/diffusers/quantizers/quanto/utils.py +++ b/src/diffusers/quantizers/quanto/utils.py @@ -2,9 +2,11 @@ import torch.nn as nn -from ...utils import is_accelerate_available +from ...utils import is_accelerate_available, logging +logger = logging.get_logger(__name__) + if is_accelerate_available(): from accelerate import init_empty_weights @@ -47,5 +49,13 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): return model model = _replace_layers(model, quantization_config, modules_to_not_convert) + has_been_replaced = any(isinstance(replaced_module, QLinear) for _, replaced_module in model.named_modules()) + + if not has_been_replaced: + logger.warning( + f"{model.__class__.__name__} does not appear to have any `nn.Linear` modules. Quantization will not be applied." + " Please check your model architecture, or submit an issue on Github if you think this is a bug." 
+ " https://github.com/huggingface/diffusers" + ) return model diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index 1132e610736f..99bf689af880 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -1,3 +1,4 @@ +import tempfile import unittest import torch @@ -9,7 +10,6 @@ from diffusers.utils import is_optimum_quanto_available from diffusers.utils.testing_utils import ( nightly, - numpy_cosine_similarity_distance, require_accelerate, require_big_gpu_with_torch_cuda, torch_device, @@ -29,6 +29,7 @@ class QuantoBaseTesterMixin: torch_dtype = torch.bfloat16 expected_memory_use_in_gb = 5 keep_in_fp32_module = "" + modules_to_not_convert = "" def get_dummy_init_kwargs(self): return {"weights": "float8"} @@ -76,6 +77,22 @@ def test_keep_modules_in_fp32(self): assert module.weight.dtype == torch.float32 self.model_cls._keep_in_fp32_modules = _keep_in_fp32_modules + def test_modules_to_not_convert(self): + init_kwargs = self.get_dummy_model_init_kwargs() + + quantization_config_kwargs = self.get_dummy_init_kwargs() + quantization_config_kwargs.update({"modules_to_not_convert": self.modules_to_not_convert}) + quantization_config = QuantoConfig(**quantization_config_kwargs) + + init_kwargs.update({"quantization_config": quantization_config}) + + model = self.model_cls.from_pretrained(**init_kwargs) + model.to("cuda") + + for name, module in model.named_modules(): + if name in self.modules_to_not_convert: + assert not isinstance(module, QLinear) + def test_dtype_assignment(self): model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb @@ -99,12 +116,35 @@ def test_dtype_assignment(self): # This should work model.to("cuda") + def test_serialization(self): + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + inputs = self.get_dummy_inputs() + + model.to(torch_device) + with torch.no_grad(): + model_output = model(**inputs) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + saved_model = self.model_cls.from_pretrained( + tmp_dir, + torch_dtype=torch.bfloat16, + ) + + saved_model.to(torch_device) + with torch.no_grad(): + saved_model_output = saved_model(**inputs) + + max_diff = torch.abs(model_output - saved_model_output).max() + assert max_diff < 1e-5 + class FluxTransformerQuantoMixin(QuantoBaseTesterMixin): model_id = "hf-internal-testing/tiny-flux-transformer" model_cls = FluxTransformer2DModel torch_dtype = torch.bfloat16 keep_in_fp32_module = "proj_out" + modules_to_not_convert = ["proj_out"] def get_dummy_inputs(self): return { @@ -130,14 +170,21 @@ def get_dummy_inputs(self): } -class FluxTransformerFloat8(FluxTransformerQuantoMixin, unittest.TestCase): +class FluxTransformerFloat8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 10 def get_dummy_init_kwargs(self): return {"weights": "float8"} -class FluxTransformerInt8(FluxTransformerQuantoMixin, unittest.TestCase): +class FluxTransformerFloat8WeightsAndActivationTest(FluxTransformerQuantoMixin, unittest.TestCase): + expected_memory_use_in_gb = 10 + + def get_dummy_init_kwargs(self): + return {"weights": "float8", "activations": "float8"} + + +class FluxTransformerInt8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 10 def get_dummy_init_kwargs(self): @@ -157,20 +204,42 @@ def test_torch_compile(self): with 
torch.no_grad(): compiled_model_output = compiled_model(**inputs).sample - max_diff = numpy_cosine_similarity_distance( - model_output.cpu().flatten(), compiled_model_output.cpu().flatten() - ) + max_diff = torch.abs(model_output - compiled_model_output).max() + assert max_diff < 1e-4 + + +class FluxTransformerInt8WeightsAndActivationTest(FluxTransformerQuantoMixin, unittest.TestCase): + expected_memory_use_in_gb = 10 + + def get_dummy_init_kwargs(self): + return {"weights": "int8", "activations": "int8"} + + def test_torch_compile(self): + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + compiled_model = torch.compile(model, mode="max-autotune", fullgraph=True) + inputs = self.get_dummy_inputs() + + model.to(torch_device) + with torch.no_grad(): + model_output = model(**inputs).sample + model.to("cpu") + + compiled_model.to(torch_device) + with torch.no_grad(): + compiled_model_output = compiled_model(**inputs).sample + + max_diff = torch.abs(model_output - compiled_model_output).max() assert max_diff < 1e-4 -class FluxTransformerInt4(FluxTransformerQuantoMixin, unittest.TestCase): +class FluxTransformerInt4WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 6 def get_dummy_init_kwargs(self): return {"weights": "int4"} -class FluxTransformerInt2(FluxTransformerQuantoMixin, unittest.TestCase): +class FluxTransformerInt2WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 6 def get_dummy_init_kwargs(self): From b136d239e4302f7c06b4d14fca60ac754277aca6 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 11 Feb 2025 12:36:14 +0100 Subject: [PATCH 15/34] update --- src/diffusers/quantizers/quanto/quanto_quantizer.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index b22832f85cb9..858b658ff269 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -103,13 +103,8 @@ def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": "int2": CustomDtype.INT2, } target_dtype = mapping[self.quantization_config.weights] - return target_dtype - else: - raise ValueError( - "You are using `device_map='auto'` on an optimum-quanto quantized model. To automatically compute" - " the appropriate device map, you should upgrade your `accelerate` library," - "`pip install --upgrade accelerate` or install it from source." 
-            )
+
+        return target_dtype

     def update_torch_dtype(self, torch_dtype: "torch.dtype" = None) -> "torch.dtype":
         if torch_dtype is None:

From 2c7f30325ddf1eb5d29bf392c4e85a81ab55bb71 Mon Sep 17 00:00:00 2001
From: Dhruv Nair
Date: Tue, 11 Feb 2025 15:39:25 +0100
Subject: [PATCH 16/34] update

---
 src/diffusers/quantizers/quantization_config.py | 1 -
 .../quantizers/quanto/quanto_quantizer.py | 5 ++++-
 src/diffusers/quantizers/quanto/utils.py | 15 +++++++++++++--
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py
index ff7dd1e15417..c9c0e837df8c 100644
--- a/src/diffusers/quantizers/quantization_config.py
+++ b/src/diffusers/quantizers/quantization_config.py
@@ -698,7 +698,6 @@ def __init__(
         weights="int8",
         activations=None,
         modules_to_not_convert: Optional[List] = None,
-        compute_dtype: Optional["torch.dtype"] = None,
         **kwargs,
     ):
         self.quant_method = QuantizationMethod.QUANTO

diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py
index 858b658ff269..0991d3550f29 100644
--- a/src/diffusers/quantizers/quanto/quanto_quantizer.py
+++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py
@@ -143,7 +143,10 @@ def _process_model_before_weight_loading(
         self.modules_to_not_convert.extend(keep_in_fp32_modules)

         model = _replace_with_quanto_layers(
-            model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config
+            model,
+            modules_to_not_convert=self.modules_to_not_convert,
+            quantization_config=self.quantization_config,
+            pre_quantized=self.pre_quantized,
         )
         model.config.quantization_config = self.quantization_config

diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py
index c71220aa57f0..891b9adf060a 100644
--- a/src/diffusers/quantizers/quanto/utils.py
+++ b/src/diffusers/quantizers/quanto/utils.py
@@ -1,5 +1,6 @@
 from typing import Optional

+import torch
 import torch.nn as nn

 from ...utils import is_accelerate_available, logging
@@ -11,9 +12,9 @@
     from accelerate import init_empty_weights


-def _replace_with_quanto_layers(model, quantization_config, modules_to_not_convert: list):
+def _replace_with_quanto_layers(model, quantization_config, modules_to_not_convert: list, pre_quantized=False):
     # Quanto imports diffusers internally.
These are placed here to avoid circular imports - from optimum.quanto import QLinear, qfloat8, qint2, qint4, qint8 + from optimum.quanto import QLinear, WeightQBytesTensor, qfloat8, qint2, qint4, qint8 def _get_weight_type(dtype: str): return {"float8": qfloat8, "int8": qint8, "int4": qint4, "int2": qint2}[dtype] @@ -42,6 +43,16 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): weights=_get_weight_type(quantization_config.weights), activations=_get_activation_type(quantization_config.activations), ) + if pre_quantized: + qlinear.weight = WeightQBytesTensor( + qtype=_get_activation_type(quantization_config.weights), + axis=0, + size=module.weight.size(), + stride=module.weight.stride(), + activation_qtype=_get_activation_type(quantization_config.activations), + data=torch.zeros_like(module.weight), + scale=torch.nn.Parameter(torch.zeros(1)), + ) model._modules[name] = qlinear model._modules[name].source_cls = type(module) model._modules[name].requires_grad_(False) From c80d4d4a7280318d561efec5c0a92fe8a5317fb6 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 12 Feb 2025 18:56:21 +0100 Subject: [PATCH 17/34] update --- src/diffusers/quantizers/quanto/quanto_quantizer.py | 13 +++++++++---- src/diffusers/quantizers/quanto/utils.py | 10 +++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index 0991d3550f29..1e1647bdcffe 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -65,6 +65,8 @@ def check_if_quantized_param( # Quanto imports diffusers internally. This is here to prevent circular imports from optimum.quanto import QModuleMixin + if self.pre_quantized: + __import__("ipdb").set_trace() module, tensor_name = get_module_from_name(model, param_name) if isinstance(module, QModuleMixin) and "weight" in tensor_name: return not module.frozen @@ -85,10 +87,13 @@ def create_quantized_param( """ dtype = kwargs.get("dtype", torch.float32) - set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) - module, _ = get_module_from_name(model, param_name) - module.freeze() - module.weight.requires_grad = False + if not self.pre_quantized: + set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) + module, _ = get_module_from_name(model, param_name) + module.freeze() + module.weight.requires_grad = False + else: + __import__("ipdb").set_trace() def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: max_memory = {key: val * 0.90 for key, val in max_memory.items()} diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py index 891b9adf060a..f5cda6c6ec0e 100644 --- a/src/diffusers/quantizers/quanto/utils.py +++ b/src/diffusers/quantizers/quanto/utils.py @@ -14,7 +14,7 @@ def _replace_with_quanto_layers(model, quantization_config, modules_to_not_convert: list, pre_quantized=False): # Quanto imports diffusers internally. 
These are placed here to avoid circular imports - from optimum.quanto import QLinear, WeightQBytesTensor, qfloat8, qint2, qint4, qint8 + from optimum.quanto import QLinear, freeze, WeightQBytesTensor, qfloat8, qint2, qint4, qint8 def _get_weight_type(dtype: str): return {"float8": qfloat8, "int8": qint8, "int4": qint4, "int2": qint2}[dtype] @@ -44,6 +44,8 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): activations=_get_activation_type(quantization_config.activations), ) if pre_quantized: + print() + """ qlinear.weight = WeightQBytesTensor( qtype=_get_activation_type(quantization_config.weights), axis=0, @@ -53,6 +55,9 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): data=torch.zeros_like(module.weight), scale=torch.nn.Parameter(torch.zeros(1)), ) + """ + # qlinear.freeze() + # qlinear.weight = torch.nn.Parameter(qlinear.qweight) model._modules[name] = qlinear model._modules[name].source_cls = type(module) model._modules[name].requires_grad_(False) @@ -69,4 +74,7 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): " https://github.com/huggingface/diffusers" ) + if pre_quantized: + freeze(model) + return model From d355e6aa9b8d291abee7a3b847da2b1352ec1b0b Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 13 Feb 2025 17:31:21 +0100 Subject: [PATCH 18/34] update --- src/diffusers/quantizers/quanto/quanto_quantizer.py | 2 -- src/diffusers/quantizers/quanto/utils.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index 1e1647bdcffe..d0cdcc61da1c 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -65,8 +65,6 @@ def check_if_quantized_param( # Quanto imports diffusers internally. 
This is here to prevent circular imports from optimum.quanto import QModuleMixin - if self.pre_quantized: - __import__("ipdb").set_trace() module, tensor_name = get_module_from_name(model, param_name) if isinstance(module, QModuleMixin) and "weight" in tensor_name: return not module.frozen diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py index f5cda6c6ec0e..5a4aa510417a 100644 --- a/src/diffusers/quantizers/quanto/utils.py +++ b/src/diffusers/quantizers/quanto/utils.py @@ -74,7 +74,7 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): " https://github.com/huggingface/diffusers" ) - if pre_quantized: - freeze(model) + # if pre_quantized: + # freeze(model) return model From 79901e4dec613d9c17a0e58deedf53362c853694 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 18 Feb 2025 19:19:30 +0100 Subject: [PATCH 19/34] update --- src/diffusers/models/model_loading_utils.py | 4 ++ .../quantizers/quanto/quanto_quantizer.py | 31 +++++----- src/diffusers/quantizers/quanto/utils.py | 24 ++------ tests/quantization/quanto/test_quanto.py | 61 ++++++++----------- 4 files changed, 50 insertions(+), 70 deletions(-) diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 969cf16b3c5e..67a84f655405 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -225,6 +225,10 @@ def load_model_dict_into_meta( param = param.to(torch.float32) if accepts_dtype: set_module_kwargs["dtype"] = torch.float32 + + elif hf_quantizer is not None and param.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: + pass + else: param = param.to(dtype) if accepts_dtype: diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index d0cdcc61da1c..3349aca934aa 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -1,11 +1,10 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Union from ...utils import ( get_module_from_name, is_accelerate_available, is_accelerate_version, is_optimum_quanto_available, - is_optimum_quanto_version, is_torch_available, logging, ) @@ -45,10 +44,6 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Loading an optimum-quanto quantized model requires optimum-quanto library (`pip install optimum-quanto`)" ) - if not is_optimum_quanto_version(">=", "0.2.6"): - raise RuntimeError( - "The minimum required version of `optimum-quanto` is 0.2.6. Please upgrade with `pip install -U optimum-quanto`." - ) if not is_accelerate_available(): raise ImportError( "Loading an optimum-quanto quantized model requires accelerate library (`pip install accelerate`)" @@ -63,10 +58,13 @@ def check_if_quantized_param( **kwargs, ): # Quanto imports diffusers internally. 
This is here to prevent circular imports - from optimum.quanto import QModuleMixin + from optimum.quanto import QModuleMixin, QTensor + from optimum.quanto.tensor.packed import PackedTensor module, tensor_name = get_module_from_name(model, param_name) - if isinstance(module, QModuleMixin) and "weight" in tensor_name: + if self.pre_quantized and any(isinstance(module, t) for t in [QTensor, PackedTensor]): + return True + elif isinstance(module, QModuleMixin) and "weight" in tensor_name: return not module.frozen return False @@ -83,15 +81,15 @@ def create_quantized_param( """ Create the quantized parameter by calling .freeze() after setting it to the module. """ - dtype = kwargs.get("dtype", torch.float32) - if not self.pre_quantized: + dtype = kwargs.get("dtype", torch.float32) + module, tensor_name = get_module_from_name(model, param_name) + if self.pre_quantized: + setattr(module, tensor_name, param_value) + else: set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) - module, _ = get_module_from_name(model, param_name) module.freeze() module.weight.requires_grad = False - else: - __import__("ipdb").set_trace() def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: max_memory = {key: val * 0.90 for key, val in max_memory.items()} @@ -161,8 +159,9 @@ def _dequantize(self, model): return @property - def is_trainable(self, model: Optional["ModelMixin"] = None): + def is_trainable(self): return True - def is_serializable(self, safe_serialization=None): - return False + @property + def is_serializable(self): + return True diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py index 5a4aa510417a..e3940dffec78 100644 --- a/src/diffusers/quantizers/quanto/utils.py +++ b/src/diffusers/quantizers/quanto/utils.py @@ -1,6 +1,5 @@ from typing import Optional -import torch import torch.nn as nn from ...utils import is_accelerate_available, logging @@ -14,7 +13,7 @@ def _replace_with_quanto_layers(model, quantization_config, modules_to_not_convert: list, pre_quantized=False): # Quanto imports diffusers internally. 
These are placed here to avoid circular imports - from optimum.quanto import QLinear, freeze, WeightQBytesTensor, qfloat8, qint2, qint4, qint8 + from optimum.quanto import QLinear, freeze, qfloat8, qint2, qint4, qint8 def _get_weight_type(dtype: str): return {"float8": qfloat8, "int8": qint8, "int4": qint4, "int2": qint2}[dtype] @@ -43,21 +42,6 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): weights=_get_weight_type(quantization_config.weights), activations=_get_activation_type(quantization_config.activations), ) - if pre_quantized: - print() - """ - qlinear.weight = WeightQBytesTensor( - qtype=_get_activation_type(quantization_config.weights), - axis=0, - size=module.weight.size(), - stride=module.weight.stride(), - activation_qtype=_get_activation_type(quantization_config.activations), - data=torch.zeros_like(module.weight), - scale=torch.nn.Parameter(torch.zeros(1)), - ) - """ - # qlinear.freeze() - # qlinear.weight = torch.nn.Parameter(qlinear.qweight) model._modules[name] = qlinear model._modules[name].source_cls = type(module) model._modules[name].requires_grad_(False) @@ -74,7 +58,9 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): " https://github.com/huggingface/diffusers" ) - # if pre_quantized: - # freeze(model) + # We need to freeze the pre_quantized model in order for the loaded state dict and model_state dict + # to match when trying to load weights with load_model_dict_into_meta + if pre_quantized: + freeze(model) return model diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index 99bf689af880..d063fae8f78f 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -30,6 +30,7 @@ class QuantoBaseTesterMixin: expected_memory_use_in_gb = 5 keep_in_fp32_module = "" modules_to_not_convert = "" + _test_torch_compile = False def get_dummy_init_kwargs(self): return {"weights": "float8"} @@ -135,9 +136,27 @@ def test_serialization(self): with torch.no_grad(): saved_model_output = saved_model(**inputs) - max_diff = torch.abs(model_output - saved_model_output).max() + max_diff = torch.abs(model_output.sample - saved_model_output.sample).max() assert max_diff < 1e-5 + def test_torch_compile(self): + if not self._test_torch_compile: + return + + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + compiled_model = torch.compile(model, mode="max-autotune", fullgraph=True) + + model.to(torch_device) + with torch.no_grad(): + model_output = model(**self.get_dummy_inputs()).sample + model.to("cpu") + + compiled_model.to(torch_device) + with torch.no_grad(): + compiled_model_output = compiled_model(**self.get_dummy_inputs()).sample + + assert torch.allclose(model_output, compiled_model_output, rtol=1e-3, atol=1e-2) + class FluxTransformerQuantoMixin(QuantoBaseTesterMixin): model_id = "hf-internal-testing/tiny-flux-transformer" @@ -145,6 +164,7 @@ class FluxTransformerQuantoMixin(QuantoBaseTesterMixin): torch_dtype = torch.bfloat16 keep_in_fp32_module = "proj_out" modules_to_not_convert = ["proj_out"] + _test_torch_compile = False def get_dummy_inputs(self): return { @@ -172,6 +192,7 @@ def get_dummy_inputs(self): class FluxTransformerFloat8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 10 + _test_torch_compile = True def get_dummy_init_kwargs(self): return {"weights": "float8"} @@ -179,6 +200,7 @@ def get_dummy_init_kwargs(self): class 
FluxTransformerFloat8WeightsAndActivationTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 10 + _test_torch_compile = True def get_dummy_init_kwargs(self): return {"weights": "float8", "activations": "float8"} @@ -186,54 +208,23 @@ def get_dummy_init_kwargs(self): class FluxTransformerInt8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 10 + _test_torch_compile = True def get_dummy_init_kwargs(self): return {"weights": "int8"} - def test_torch_compile(self): - model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) - compiled_model = torch.compile(model, mode="max-autotune", fullgraph=True) - inputs = self.get_dummy_inputs() - - model.to(torch_device) - with torch.no_grad(): - model_output = model(**inputs).sample - model.to("cpu") - - compiled_model.to(torch_device) - with torch.no_grad(): - compiled_model_output = compiled_model(**inputs).sample - - max_diff = torch.abs(model_output - compiled_model_output).max() - assert max_diff < 1e-4 - class FluxTransformerInt8WeightsAndActivationTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 10 + _test_torch_compile = True def get_dummy_init_kwargs(self): return {"weights": "int8", "activations": "int8"} - def test_torch_compile(self): - model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) - compiled_model = torch.compile(model, mode="max-autotune", fullgraph=True) - inputs = self.get_dummy_inputs() - - model.to(torch_device) - with torch.no_grad(): - model_output = model(**inputs).sample - model.to("cpu") - - compiled_model.to(torch_device) - with torch.no_grad(): - compiled_model_output = compiled_model(**inputs).sample - - max_diff = torch.abs(model_output - compiled_model_output).max() - assert max_diff < 1e-4 - class FluxTransformerInt4WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_use_in_gb = 6 + _test_torch_compile = True def get_dummy_init_kwargs(self): return {"weights": "int4"} From c4b6e24fe5536e0191ac5b4b0e2bc072afb286de Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 20 Feb 2025 05:23:49 +0100 Subject: [PATCH 20/34] update --- docs/source/en/quantization/quanto.md | 19 +- setup.py | 5 + src/diffusers/quantizers/auto.py | 4 +- .../quantizers/quantization_config.py | 16 +- src/diffusers/quantizers/quanto/utils.py | 4 +- tests/quantization/quanto/test_quanto.py | 181 ++++++++++++++---- 6 files changed, 181 insertions(+), 48 deletions(-) diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index bacb371131eb..887c1db8c161 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -91,19 +91,30 @@ model = FluxTransformer2DModel.from_pretrained(" ## Using `torch.compile` with Quanto -Currently the Quanto backend only supports `torch.compile` for `int8` weights and activations. 
+Currently the Quanto backend supports `torch.compile` for the following quantization types: + +- `int8` weights ```python import torch -from diffusers import FluxTransformer2DModel, QuantoConfig +from diffusers import FluxPipeline, FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" quantization_config = QuantoConfig(weights="int8") -transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) +transformer = FluxTransformer2DModel.from_pretrained( + model_id, + subfolder="transformer", + quantization_config=quantization_config, + torch_dtype=torch.bfloat16, +) transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True) -pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype) +pipe = FluxPipeline.from_pretrained( + model_id, transformer=transformer, torch_dtype=torch_dtype +) pipe.to("cuda") +images = pipe("A cat holding a sign that says hello").images[0] +images.save("flux-quanto.png") ``` ## Supported Quantization Types diff --git a/setup.py b/setup.py index f2eabe732c8e..df6267f3997c 100644 --- a/setup.py +++ b/setup.py @@ -236,6 +236,11 @@ def run(self): ) extras["torch"] = deps_list("torch", "accelerate") +extras["bitsandbytes"] = deps_list("bitsandbytes", "accelerate") +extras["gguf"] = deps_list("gguf", "accelerate") +extras["quanto"] = deps_list("quanto", "accelerate") +extras["torchao"] = deps_list("torchao", "accelerate") + if os.name == "nt": # windows extras["flax"] = [] # jax is not supported on windows else: diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index 687b29470072..ce214ae7bc17 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -37,16 +37,16 @@ "bitsandbytes_4bit": BnB4BitDiffusersQuantizer, "bitsandbytes_8bit": BnB8BitDiffusersQuantizer, "gguf": GGUFQuantizer, - "torchao": TorchAoHfQuantizer, "quanto": QuantoQuantizer, + "torchao": TorchAoHfQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { "bitsandbytes_4bit": BitsAndBytesConfig, "bitsandbytes_8bit": BitsAndBytesConfig, "gguf": GGUFQuantizationConfig, - "torchao": TorchAoConfig, "quanto": QuantoConfig, + "torchao": TorchAoConfig, } diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index c9c0e837df8c..f04feaa4794f 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -695,14 +695,14 @@ class QuantoConfig(QuantizationConfigMixin): def __init__( self, - weights="int8", - activations=None, + weights_dtype="int8", + activations_dtype=None, modules_to_not_convert: Optional[List] = None, **kwargs, ): self.quant_method = QuantizationMethod.QUANTO - self.weights = weights - self.activations = activations + self.weights_dtype = weights_dtype + self.activations_dtype = activations_dtype self.modules_to_not_convert = modules_to_not_convert self.post_init() @@ -713,8 +713,8 @@ def post_init(self): """ accepted_weights = ["float8", "int8", "int4", "int2"] accepted_activations = [None, "int8", "float8"] - if self.weights not in accepted_weights: - raise ValueError(f"Only support weights in {accepted_weights} but found {self.weights}") + if self.weights_dtype not in accepted_weights: + raise ValueError(f"Only support weights in {accepted_weights} but found {self.weights_dtype}") - if self.activations not in accepted_activations: - raise ValueError(f"Only support weights in 
{accepted_activations} but found {self.activations}") + if self.activations_dtype not in accepted_activations: + raise ValueError(f"Only support weights in {accepted_activations} but found {self.activations_dtype}") diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py index e3940dffec78..f590df68276c 100644 --- a/src/diffusers/quantizers/quanto/utils.py +++ b/src/diffusers/quantizers/quanto/utils.py @@ -39,8 +39,8 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): out_features=module.out_features, bias=module.bias is not None, dtype=module.weight.dtype, - weights=_get_weight_type(quantization_config.weights), - activations=_get_activation_type(quantization_config.activations), + weights=_get_weight_type(quantization_config.weights_dtype), + activations=_get_activation_type(quantization_config.activations_dtype), ) model._modules[name] = qlinear model._modules[name].source_cls = type(module) diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index d063fae8f78f..6e4fccdba78a 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -1,39 +1,76 @@ import tempfile +import gc import unittest import torch -from diffusers import ( - QuantoConfig, -) -from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel -from diffusers.utils import is_optimum_quanto_available +from diffusers import QuantoConfig, FluxTransformer2DModel, FluxPipeline +from diffusers.utils import is_torch_available, is_optimum_quanto_available from diffusers.utils.testing_utils import ( nightly, + numpy_cosine_similarity_distance, require_accelerate, require_big_gpu_with_torch_cuda, torch_device, ) - +from diffusers.models.attention_processor import Attention if is_optimum_quanto_available(): from optimum.quanto import QLinear +if is_torch_available(): + import torch + import torch.nn as nn + + class LoRALayer(nn.Module): + """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only + + Taken from + https://github.com/huggingface/transformers/blob/566302686a71de14125717dea9a6a45b24d42b37/tests/quantization/bnb/test_4bit.py#L62C5-L78C77 + """ + + def __init__(self, module: nn.Module, rank: int): + super().__init__() + self.module = module + self.adapter = nn.Sequential( + nn.Linear(module.in_features, rank, bias=False), + nn.Linear(rank, module.out_features, bias=False), + ) + small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5 + nn.init.normal_(self.adapter[0].weight, std=small_std) + nn.init.zeros_(self.adapter[1].weight) + self.adapter.to(module.weight.device) + + def forward(self, input, *args, **kwargs): + return self.module(input, *args, **kwargs) + self.adapter(input) + @nightly @require_big_gpu_with_torch_cuda @require_accelerate class QuantoBaseTesterMixin: model_id = None + pipeline_model_id = None model_cls = None torch_dtype = torch.bfloat16 - expected_memory_use_in_gb = 5 + # the expected reduction in peak memory used compared to an unquantized model expressed as a percentage + expected_memory_reduction = 0.0 keep_in_fp32_module = "" modules_to_not_convert = "" _test_torch_compile = False + def setUp(self): + torch.cuda.reset_peak_memory_stats() + torch.cuda.empty_cache() + gc.collect() + + def tearDown(self): + torch.cuda.reset_peak_memory_stats() + torch.cuda.empty_cache() + gc.collect() + def get_dummy_init_kwargs(self): - return {"weights": "float8"} + return {"weights_dtype": "float8"} 
def get_dummy_model_init_kwargs(self): return { @@ -49,16 +86,20 @@ def test_quanto_layers(self): assert isinstance(module, QLinear) def test_quanto_memory_usage(self): + unquantized_model = self.model_cls.from_pretrained(self.model_id, torch_dtype=self.torch_dtype) + unquantized_model_memory = unquantized_model.get_memory_footprint() / 1024**3 + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) - assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb inputs = self.get_dummy_inputs() torch.cuda.reset_peak_memory_stats() torch.cuda.empty_cache() + + model.to(torch_device) with torch.no_grad(): model(**inputs) - max_memory = torch.cuda.max_memory_allocated() - assert (max_memory / 1024**3) < self.expected_memory_use_in_gb + max_memory = torch.cuda.max_memory_allocated() / 1024**3 + assert (1.0 - (max_memory / unquantized_model_memory)) >= self.expected_memory_reduction def test_keep_modules_in_fp32(self): r""" @@ -71,7 +112,6 @@ def test_keep_modules_in_fp32(self): model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) model.to("cuda") - assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb for name, module in model.named_modules(): if isinstance(module, torch.nn.Linear): if name in model._keep_in_fp32_modules: @@ -96,7 +136,6 @@ def test_modules_to_not_convert(self): def test_dtype_assignment(self): model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) - assert (model.get_memory_footprint() / 1024**3) < self.expected_memory_use_in_gb with self.assertRaises(ValueError): # Tries with a `dtype` @@ -136,31 +175,30 @@ def test_serialization(self): with torch.no_grad(): saved_model_output = saved_model(**inputs) - max_diff = torch.abs(model_output.sample - saved_model_output.sample).max() - assert max_diff < 1e-5 + assert torch.allclose(model_output.sample, saved_model_output.sample, rtol=1e-5, atol=1e-5) def test_torch_compile(self): if not self._test_torch_compile: return model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) - compiled_model = torch.compile(model, mode="max-autotune", fullgraph=True) + compiled_model = torch.compile(model, mode="max-autotune", fullgraph=True, dynamic=False) model.to(torch_device) with torch.no_grad(): model_output = model(**self.get_dummy_inputs()).sample - model.to("cpu") compiled_model.to(torch_device) with torch.no_grad(): compiled_model_output = compiled_model(**self.get_dummy_inputs()).sample - assert torch.allclose(model_output, compiled_model_output, rtol=1e-3, atol=1e-2) + assert torch.allclose(model_output, compiled_model_output, rtol=1e-2, atol=1e-3) class FluxTransformerQuantoMixin(QuantoBaseTesterMixin): model_id = "hf-internal-testing/tiny-flux-transformer" model_cls = FluxTransformer2DModel + pipeline_cls = FluxPipeline torch_dtype = torch.bfloat16 keep_in_fp32_module = "proj_out" modules_to_not_convert = ["proj_out"] @@ -189,49 +227,128 @@ def get_dummy_inputs(self): "guidance": torch.tensor([3.5]).to(torch_device, self.torch_dtype), } + def get_dummy_training_inputs(self, device=None, seed: int = 0): + batch_size = 1 + num_latent_channels = 4 + num_image_channels = 3 + height = width = 4 + sequence_length = 48 + embedding_dim = 32 + + torch.manual_seed(seed) + hidden_states = torch.randn((batch_size, height * width, num_latent_channels)).to(device, dtype=torch.bfloat16) + + torch.manual_seed(seed) + encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to( + device, 
dtype=torch.bfloat16 + ) + + torch.manual_seed(seed) + pooled_prompt_embeds = torch.randn((batch_size, embedding_dim)).to(device, dtype=torch.bfloat16) + + torch.manual_seed(seed) + text_ids = torch.randn((sequence_length, num_image_channels)).to(device, dtype=torch.bfloat16) + + torch.manual_seed(seed) + image_ids = torch.randn((height * width, num_image_channels)).to(device, dtype=torch.bfloat16) + + timestep = torch.tensor([1.0]).to(device, dtype=torch.bfloat16).expand(batch_size) + + return { + "hidden_states": hidden_states, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_prompt_embeds, + "txt_ids": text_ids, + "img_ids": image_ids, + "timestep": timestep, + } + + def test_model_cpu_offload(self): + init_kwargs = self.get_dummy_init_kwargs() + transformer = self.model_cls.from_pretrained( + "hf-internal-testing/tiny-flux-pipe", + quantization_config=QuantoConfig(**init_kwargs), + subfolder="transformer", + torch_dtype=torch.bfloat16, + ) + pipe = self.pipeline_cls.from_pretrained( + "hf-internal-testing/tiny-flux-pipe", transformer=transformer, torch_dtype=torch.bfloat16 + ) + pipe.enable_model_cpu_offload(device=torch_device) + images = pipe("a cat holding a sign that says hello", num_inference_steps=2) + + def test_training(self): + quantization_config = QuantoConfig(**self.get_dummy_init_kwargs()) + quantized_model = self.model_cls.from_pretrained( + "hf-internal-testing/tiny-flux-pipe", + subfolder="transformer", + quantization_config=quantization_config, + torch_dtype=torch.bfloat16, + ).to(torch_device) + + for param in quantized_model.parameters(): + # freeze the model as only adapter layers will be trained + param.requires_grad = False + if param.ndim == 1: + param.data = param.data.to(torch.float32) + + for _, module in quantized_model.named_modules(): + if isinstance(module, Attention): + module.to_q = LoRALayer(module.to_q, rank=4) + module.to_k = LoRALayer(module.to_k, rank=4) + module.to_v = LoRALayer(module.to_v, rank=4) + + with torch.amp.autocast(str(torch_device), dtype=torch.bfloat16): + inputs = self.get_dummy_training_inputs(torch_device) + output = quantized_model(**inputs)[0] + output.norm().backward() + + for module in quantized_model.modules(): + if isinstance(module, LoRALayer): + self.assertTrue(module.adapter[1].weight.grad is not None) + self.assertTrue(module.adapter[1].weight.grad.norm().item() > 0) + class FluxTransformerFloat8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): - expected_memory_use_in_gb = 10 + expected_memory_reduction = 0.3 _test_torch_compile = True def get_dummy_init_kwargs(self): - return {"weights": "float8"} + return {"weights_dtype": "float8"} class FluxTransformerFloat8WeightsAndActivationTest(FluxTransformerQuantoMixin, unittest.TestCase): - expected_memory_use_in_gb = 10 - _test_torch_compile = True + expected_memory_reduction = 0.3 def get_dummy_init_kwargs(self): - return {"weights": "float8", "activations": "float8"} + return {"weights_dtype": "float8", "activations_dtype": "float8"} class FluxTransformerInt8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): - expected_memory_use_in_gb = 10 + expected_memory_reduction = 0.3 _test_torch_compile = True def get_dummy_init_kwargs(self): - return {"weights": "int8"} + return {"weights_dtype": "int8"} class FluxTransformerInt8WeightsAndActivationTest(FluxTransformerQuantoMixin, unittest.TestCase): - expected_memory_use_in_gb = 10 - _test_torch_compile = True + expected_memory_reduction = 0.3 def get_dummy_init_kwargs(self): - 
return {"weights": "int8", "activations": "int8"} + return {"weights_dtype": "int8", "activations_dtype": "int8"} class FluxTransformerInt4WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): - expected_memory_use_in_gb = 6 + expected_memory_reduction = 0.55 _test_torch_compile = True def get_dummy_init_kwargs(self): - return {"weights": "int4"} + return {"weights_dtype": "int4"} class FluxTransformerInt2WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): - expected_memory_use_in_gb = 6 + expected_memory_reduction = 0.65 def get_dummy_init_kwargs(self): - return {"weights": "int2"} + return {"weights_dtype": "int2"} From 6cf9a781089bc078b368e7920bff1322fd49a42f Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 20 Feb 2025 08:19:00 +0100 Subject: [PATCH 21/34] update --- docs/source/en/api/quantization.md | 5 +++++ docs/source/en/quantization/quanto.md | 10 ++++----- setup.py | 2 +- src/diffusers/models/model_loading_utils.py | 7 ++++++- .../quantizers/quanto/quanto_quantizer.py | 8 +++++++ tests/quantization/quanto/test_quanto.py | 21 ++++++++++--------- 6 files changed, 36 insertions(+), 17 deletions(-) diff --git a/docs/source/en/api/quantization.md b/docs/source/en/api/quantization.md index 168a9a03473f..2c728cff3c07 100644 --- a/docs/source/en/api/quantization.md +++ b/docs/source/en/api/quantization.md @@ -31,6 +31,11 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui ## GGUFQuantizationConfig [[autodoc]] GGUFQuantizationConfig + +## QuantoConfig + +[[autodoc]] QuantoConfig + ## TorchAoConfig [[autodoc]] TorchAoConfig diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index 887c1db8c161..306dcc2eb12c 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -33,7 +33,7 @@ import torch from diffusers import FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" -quantization_config = QuantoConfig(weights="float8") +quantization_config = QuantoConfig(weights_dtype="float8") transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype) @@ -55,7 +55,7 @@ import torch from diffusers import FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" -quantization_config = QuantoConfig(weights="float8", modules_to_not_convert=["proj_out"]) +quantization_config = QuantoConfig(weights_dtype="float8", modules_to_not_convert=["proj_out"]) transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) ``` @@ -66,7 +66,7 @@ import torch from diffusers import FluxTransformer2DModel, QuantoConfig ckpt_path = "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors" -quantization_config = QuantoConfig(weights="float8") +quantization_config = QuantoConfig(weights_dtype="float8") transformer = FluxTransformer2DModel.from_single_file(ckpt_path, quantization_config=quantization_config, torch_dtype=torch.bfloat16) ``` @@ -79,7 +79,7 @@ import torch from diffusers import FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" -quantization_config = QuantoConfig(weights="float8") +quantization_config = QuantoConfig(weights_dtype="float8") transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, 
torch_dtype=torch.bfloat16) # save quantized model to reuse @@ -100,7 +100,7 @@ import torch from diffusers import FluxPipeline, FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" -quantization_config = QuantoConfig(weights="int8") +quantization_config = QuantoConfig(weights_dtype="int8") transformer = FluxTransformer2DModel.from_pretrained( model_id, subfolder="transformer", diff --git a/setup.py b/setup.py index 5646356b0f47..b8ed229a692d 100644 --- a/setup.py +++ b/setup.py @@ -241,7 +241,7 @@ def run(self): extras["bitsandbytes"] = deps_list("bitsandbytes", "accelerate") extras["gguf"] = deps_list("gguf", "accelerate") -extras["quanto"] = deps_list("quanto", "accelerate") +extras["quanto"] = deps_list("optimum_quanto", "accelerate") extras["torchao"] = deps_list("torchao", "accelerate") if os.name == "nt": # windows diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index 9c838ac61476..f06a5e5e7a9c 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -259,6 +259,9 @@ def load_model_dict_into_meta( ): param = param.to(torch.float32) set_module_kwargs["dtype"] = torch.float32 + # For quantizers have save weights using torch.float8_e4m3fn + elif hf_quantizer is not None and param.dtype == getattr(torch, "float8_e4m3fn", None): + pass else: param = param.to(dtype) set_module_kwargs["dtype"] = dtype @@ -306,7 +309,9 @@ def load_model_dict_into_meta( elif is_quantized and ( hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=param_device) ): - hf_quantizer.create_quantized_param(model, param, param_name, param_device, state_dict, unexpected_keys) + hf_quantizer.create_quantized_param( + model, param, param_name, param_device, state_dict, unexpected_keys, dtype=dtype + ) else: set_module_tensor_to_device(model, param_name, param_device, value=param, **set_module_kwargs) diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index 3349aca934aa..2fd043c8d96f 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -1,5 +1,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Union +from diffusers.utils.import_utils import is_optimum_quanto_version + from ...utils import ( get_module_from_name, is_accelerate_available, @@ -44,6 +46,12 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Loading an optimum-quanto quantized model requires optimum-quanto library (`pip install optimum-quanto`)" ) + if not is_optimum_quanto_version(">=", "0.2.6"): + raise ImportError( + "Loading an optimum-quanto quantized model requires `optimum-quanto>=0.2.6`. 
" + "Please upgrade your installation with `pip install --upgrade optimum-quanto" + ) + if not is_accelerate_available(): raise ImportError( "Loading an optimum-quanto quantized model requires accelerate library (`pip install accelerate`)" diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index 6e4fccdba78a..5bce8072949a 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -1,11 +1,10 @@ -import tempfile import gc +import tempfile import unittest -import torch - -from diffusers import QuantoConfig, FluxTransformer2DModel, FluxPipeline -from diffusers.utils import is_torch_available, is_optimum_quanto_available +from diffusers import FluxPipeline, FluxTransformer2DModel, QuantoConfig +from diffusers.models.attention_processor import Attention +from diffusers.utils import is_optimum_quanto_available, is_torch_available from diffusers.utils.testing_utils import ( nightly, numpy_cosine_similarity_distance, @@ -13,7 +12,7 @@ require_big_gpu_with_torch_cuda, torch_device, ) -from diffusers.models.attention_processor import Attention + if is_optimum_quanto_available(): from optimum.quanto import QLinear @@ -192,7 +191,11 @@ def test_torch_compile(self): with torch.no_grad(): compiled_model_output = compiled_model(**self.get_dummy_inputs()).sample - assert torch.allclose(model_output, compiled_model_output, rtol=1e-2, atol=1e-3) + model_output = model_output.detach().float().cpu().numpy() + compiled_model_output = compiled_model_output.detach().float().cpu().numpy() + + max_diff = numpy_cosine_similarity_distance(model_output.flatten(), compiled_model_output.flatten()) + assert max_diff < 1e-3 class FluxTransformerQuantoMixin(QuantoBaseTesterMixin): @@ -275,7 +278,7 @@ def test_model_cpu_offload(self): "hf-internal-testing/tiny-flux-pipe", transformer=transformer, torch_dtype=torch.bfloat16 ) pipe.enable_model_cpu_offload(device=torch_device) - images = pipe("a cat holding a sign that says hello", num_inference_steps=2) + _ = pipe("a cat holding a sign that says hello", num_inference_steps=2) def test_training(self): quantization_config = QuantoConfig(**self.get_dummy_init_kwargs()) @@ -311,7 +314,6 @@ def test_training(self): class FluxTransformerFloat8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_reduction = 0.3 - _test_torch_compile = True def get_dummy_init_kwargs(self): return {"weights_dtype": "float8"} @@ -341,7 +343,6 @@ def get_dummy_init_kwargs(self): class FluxTransformerInt4WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_reduction = 0.55 - _test_torch_compile = True def get_dummy_init_kwargs(self): return {"weights_dtype": "int4"} From 0736f87d1f5ddf34086ca3b6ff941fce55b276ec Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 20 Feb 2025 09:03:50 +0100 Subject: [PATCH 22/34] update --- tests/quantization/quanto/test_quanto.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index 5bce8072949a..952280a7b487 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -309,7 +309,6 @@ def test_training(self): for module in quantized_model.modules(): if isinstance(module, LoRALayer): self.assertTrue(module.adapter[1].weight.grad is not None) - self.assertTrue(module.adapter[1].weight.grad.norm().item() > 0) class FluxTransformerFloat8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): From 
4eabed7f976176b271662b209075edfbc0117ab7 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 25 Feb 2025 05:12:25 +0100 Subject: [PATCH 23/34] update --- docs/source/en/quantization/quanto.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index 306dcc2eb12c..729093ea9c11 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -26,7 +26,7 @@ In order to use the Quanto backend, you will first need to install `optimum-quan pip install optimum-quanto accelerate ``` -Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. The following snippet demonstrates how to apply `float8` quantization with Quanto. +Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. Although the Quanto library does allow quantizing `nn.Conv2d` and `nn.LayerNorm` modules, currently, Diffusers only supports quantizing the `nn.Linear` layers in a model. The following snippet demonstrates how to apply `float8` quantization with Quanto. ```python import torch From f512c2893fca5b54aec87a7dd9c27c05e8180d69 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 25 Feb 2025 11:52:48 +0100 Subject: [PATCH 24/34] update --- src/diffusers/quantizers/quanto/quanto_quantizer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index 2fd043c8d96f..5778773f8940 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -57,6 +57,12 @@ def validate_environment(self, *args, **kwargs): "Loading an optimum-quanto quantized model requires accelerate library (`pip install accelerate`)" ) + device_map = kwargs.get("device_map", None) + if isinstance(device_map, dict) and len(device_map.keys()) > 1: + raise ValueError( + "`device_map` for multi-GPU inference or CPU/disk offload is currently not supported with the Quanto backend" + ) + def check_if_quantized_param( self, model: "ModelMixin", @@ -104,14 +110,14 @@ def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, return max_memory def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": - if is_accelerate_version(">=0.27.0"): + if is_accelerate_version(">=", "0.27.0"): mapping = { "int8": torch.int8, "float8": CustomDtype.FP8, "int4": CustomDtype.INT4, "int2": CustomDtype.INT2, } - target_dtype = mapping[self.quantization_config.weights] + target_dtype = mapping[self.quantization_config.weights_dtype] return target_dtype From dbaef7c3a421d45e49377784316f65394c8895c3 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 25 Feb 2025 13:02:26 +0100 Subject: [PATCH 25/34] update --- tests/quantization/quanto/test_quanto.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index 952280a7b487..f0f78988b875 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -197,6 +197,12 @@ def test_torch_compile(self): max_diff = numpy_cosine_similarity_distance(model_output.flatten(), compiled_model_output.flatten()) assert max_diff < 1e-3 + def test_device_map_error(self): + with self.assertRaises(ValueError): + model = self.model_cls.from_pretrained( + **self.get_dummy_model_init_kwargs(), 
device_map={0: "8GB", "cpu": "16GB"} + ) + class FluxTransformerQuantoMixin(QuantoBaseTesterMixin): model_id = "hf-internal-testing/tiny-flux-transformer" From 963559f69bed66443702a3ac6cf804b52bba1ce8 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 25 Feb 2025 13:14:59 +0100 Subject: [PATCH 26/34] update --- tests/quantization/quanto/test_quanto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index f0f78988b875..6e2e795bb033 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -199,7 +199,7 @@ def test_torch_compile(self): def test_device_map_error(self): with self.assertRaises(ValueError): - model = self.model_cls.from_pretrained( + _ = self.model_cls.from_pretrained( **self.get_dummy_model_init_kwargs(), device_map={0: "8GB", "cpu": "16GB"} ) From 4516f2238bf1ee87687532a25be3c89a2cb87457 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 7 Mar 2025 03:51:28 +0100 Subject: [PATCH 27/34] update --- docs/source/en/quantization/quanto.md | 15 +++++++++------ src/diffusers/quantizers/quantization_config.py | 16 ++++------------ .../quantizers/quanto/quanto_quantizer.py | 2 +- src/diffusers/quantizers/quanto/utils.py | 8 +------- tests/quantization/quanto/test_quanto.py | 14 -------------- 5 files changed, 15 insertions(+), 40 deletions(-) diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index 729093ea9c11..75a22c13b8b3 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -26,7 +26,7 @@ In order to use the Quanto backend, you will first need to install `optimum-quan pip install optimum-quanto accelerate ``` -Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. Although the Quanto library does allow quantizing `nn.Conv2d` and `nn.LayerNorm` modules, currently, Diffusers only supports quantizing the `nn.Linear` layers in a model. The following snippet demonstrates how to apply `float8` quantization with Quanto. +Now you can quantize a model by passing the `QuantoConfig` object to the `from_pretrained()` method. Although the Quanto library does allow quantizing `nn.Conv2d` and `nn.LayerNorm` modules, currently, Diffusers only supports quantizing the weights in the `nn.Linear` layers of a model. The following snippet demonstrates how to apply `float8` quantization with Quanto. ```python import torch @@ -61,6 +61,8 @@ transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_conf ## Using `from_single_file` with the Quanto Backend +`QuantoConfig` is compatible with `~FromOriginalModelMixin.from_single_file`. + ```python import torch from diffusers import FluxTransformer2DModel, QuantoConfig @@ -72,7 +74,10 @@ transformer = FluxTransformer2DModel.from_single_file(ckpt_path, quantization_co ## Saving Quantized models -Diffusers supports serializing and saving Quanto models using the `save_pretrained` method. +Diffusers supports serializing Quanto models using the `~ModelMixin.save_pretrained` method. + +The serialization and loading requirements are different for models quantized directly with the Quanto library and models quantized +with Diffusers using Quanto as the backend. 
It is currently not possible to load models quantized directly with Quanto into Diffusers using `~ModelMixin.from_pretrained` ```python import torch @@ -114,7 +119,7 @@ pipe = FluxPipeline.from_pretrained( ) pipe.to("cuda") images = pipe("A cat holding a sign that says hello").images[0] -images.save("flux-quanto.png") +images.save("flux-quanto-compile.png") ``` ## Supported Quantization Types @@ -126,6 +131,4 @@ images.save("flux-quanto.png") - int4 - int2 -### Activations -- float8 -- int8 + diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index f04feaa4794f..50e18af80abf 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -684,25 +684,21 @@ class QuantoConfig(QuantizationConfigMixin): loaded using `quanto`. Args: - weights (`str`, *optional*, defaults to `"int8"`): + weights_dtype (`str`, *optional*, defaults to `"int8"`): The target dtype for the weights after quantization. Supported values are ("float8","int8","int4","int2") - activations (`str`, *optional*): - The target dtype for the activations after quantization. Supported values are (None,"int8","float8") - modules_to_not_convert (`list`, *optional*, default to `None`): + modules_to_not_convert (`list`, *optional*, default to `None`): The list of modules to not quantize, useful for quantizing models that explicitly require to have some modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers). """ def __init__( self, - weights_dtype="int8", - activations_dtype=None, - modules_to_not_convert: Optional[List] = None, + weights_dtype: str = "int8", + modules_to_not_convert: Optional[List[str]] = None, **kwargs, ): self.quant_method = QuantizationMethod.QUANTO self.weights_dtype = weights_dtype - self.activations_dtype = activations_dtype self.modules_to_not_convert = modules_to_not_convert self.post_init() @@ -712,9 +708,5 @@ def post_init(self): Safety checker that arguments are correct """ accepted_weights = ["float8", "int8", "int4", "int2"] - accepted_activations = [None, "int8", "float8"] if self.weights_dtype not in accepted_weights: raise ValueError(f"Only support weights in {accepted_weights} but found {self.weights_dtype}") - - if self.activations_dtype not in accepted_activations: - raise ValueError(f"Only support weights in {accepted_activations} but found {self.activations_dtype}") diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index 5778773f8940..231b2fa2ff99 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -60,7 +60,7 @@ def validate_environment(self, *args, **kwargs): device_map = kwargs.get("device_map", None) if isinstance(device_map, dict) and len(device_map.keys()) > 1: raise ValueError( - "`device_map` for multi-GPU inference or CPU/disk offload is currently not supported with the Quanto backend" + "`device_map` for multi-GPU inference or CPU/disk offload is currently not supported with Diffusers and the Quanto backend" ) def check_if_quantized_param( diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py index f590df68276c..f14818ab3cd8 100644 --- a/src/diffusers/quantizers/quanto/utils.py +++ b/src/diffusers/quantizers/quanto/utils.py @@ -1,5 +1,3 @@ -from typing import Optional - import torch.nn as nn from ...utils import is_accelerate_available, logging @@ 
-18,9 +16,6 @@ def _replace_with_quanto_layers(model, quantization_config, modules_to_not_conve def _get_weight_type(dtype: str): return {"float8": qfloat8, "int8": qint8, "int4": qint4, "int2": qint2}[dtype] - def _get_activation_type(dtype: Optional[str]): - return {None: None, "float8": qfloat8, "int8": qint8}[dtype] - def _replace_layers(model, quantization_config, modules_to_not_convert): has_children = list(model.children()) if not has_children: @@ -40,7 +35,6 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): bias=module.bias is not None, dtype=module.weight.dtype, weights=_get_weight_type(quantization_config.weights_dtype), - activations=_get_activation_type(quantization_config.activations_dtype), ) model._modules[name] = qlinear model._modules[name].source_cls = type(module) @@ -58,7 +52,7 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): " https://github.com/huggingface/diffusers" ) - # We need to freeze the pre_quantized model in order for the loaded state dict and model_state dict + # We need to freeze the pre_quantized model in order for the loaded state_dict and model state dict # to match when trying to load weights with load_model_dict_into_meta if pre_quantized: freeze(model) diff --git a/tests/quantization/quanto/test_quanto.py b/tests/quantization/quanto/test_quanto.py index 6e2e795bb033..89a56c15ed24 100644 --- a/tests/quantization/quanto/test_quanto.py +++ b/tests/quantization/quanto/test_quanto.py @@ -324,13 +324,6 @@ def get_dummy_init_kwargs(self): return {"weights_dtype": "float8"} -class FluxTransformerFloat8WeightsAndActivationTest(FluxTransformerQuantoMixin, unittest.TestCase): - expected_memory_reduction = 0.3 - - def get_dummy_init_kwargs(self): - return {"weights_dtype": "float8", "activations_dtype": "float8"} - - class FluxTransformerInt8WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_reduction = 0.3 _test_torch_compile = True @@ -339,13 +332,6 @@ def get_dummy_init_kwargs(self): return {"weights_dtype": "int8"} -class FluxTransformerInt8WeightsAndActivationTest(FluxTransformerQuantoMixin, unittest.TestCase): - expected_memory_reduction = 0.3 - - def get_dummy_init_kwargs(self): - return {"weights_dtype": "int8", "activations_dtype": "int8"} - - class FluxTransformerInt4WeightsTest(FluxTransformerQuantoMixin, unittest.TestCase): expected_memory_reduction = 0.55 From 830b7345b75c6aae18bd00cf8651b98a77489f3b Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 7 Mar 2025 04:09:04 +0100 Subject: [PATCH 28/34] update --- .github/workflows/nightly_tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index a40be8558499..70dcf0a5f9cb 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -418,6 +418,8 @@ jobs: test_location: "gguf" - backend: "torchao" test_location: "torchao" + - backend: "optimum_quanto" + test_location: "quanto" runs-on: group: aws-g6e-xlarge-plus container: From 8163687e08ea435d5ea9515edbfa4b1571c7cbf9 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 7 Mar 2025 04:21:46 +0100 Subject: [PATCH 29/34] update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b8ed229a692d..fdc166a81ecf 100644 --- a/setup.py +++ b/setup.py @@ -241,7 +241,7 @@ def run(self): extras["bitsandbytes"] = deps_list("bitsandbytes", "accelerate") extras["gguf"] = deps_list("gguf", "accelerate") -extras["quanto"] = 
deps_list("optimum_quanto", "accelerate") +extras["optimum_quanto"] = deps_list("optimum_quanto", "accelerate") extras["torchao"] = deps_list("torchao", "accelerate") if os.name == "nt": # windows From bb7fb66b4dafe8a5b2f8a2fe7765137acfe635d8 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 7 Mar 2025 17:15:20 +0100 Subject: [PATCH 30/34] update --- docs/source/en/quantization/quanto.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index 75a22c13b8b3..e2ea730ba7b8 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -85,8 +85,12 @@ from diffusers import FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" quantization_config = QuantoConfig(weights_dtype="float8") -transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) - +transformer = FluxTransformer2DModel.from_pretrained( + model_id, + subfolder="transformer", + quantization_config=quantization_config, + torch_dtype=torch.bfloat16, +) # save quantized model to reuse transformer.save_pretrained("") From 6cad1d537a4a70694f1dce891bcf5ea916cd72ae Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 7 Mar 2025 17:22:33 +0100 Subject: [PATCH 31/34] update --- docs/source/en/quantization/quanto.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index e2ea730ba7b8..025c9cf91805 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -56,7 +56,12 @@ from diffusers import FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" quantization_config = QuantoConfig(weights_dtype="float8", modules_to_not_convert=["proj_out"]) -transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) +transformer = FluxTransformer2DModel.from_pretrained( + model_id, + subfolder="transformer", + quantization_config=quantization_config, + torch_dtype=torch.bfloat16, +) ``` ## Using `from_single_file` with the Quanto Backend From d5ab9cadc093d65a62e4f7fc1ca97259b44bfb1d Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 7 Mar 2025 21:53:05 +0530 Subject: [PATCH 32/34] Update src/diffusers/quantizers/quanto/utils.py Co-authored-by: Sayak Paul --- src/diffusers/quantizers/quanto/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/quanto/utils.py b/src/diffusers/quantizers/quanto/utils.py index f14818ab3cd8..6f41fd36b43a 100644 --- a/src/diffusers/quantizers/quanto/utils.py +++ b/src/diffusers/quantizers/quanto/utils.py @@ -49,7 +49,7 @@ def _replace_layers(model, quantization_config, modules_to_not_convert): logger.warning( f"{model.__class__.__name__} does not appear to have any `nn.Linear` modules. Quantization will not be applied." " Please check your model architecture, or submit an issue on Github if you think this is a bug." 
- " https://github.com/huggingface/diffusers" + " https://github.com/huggingface/diffusers/issues/new" ) # We need to freeze the pre_quantized model in order for the loaded state_dict and model state dict From deebc22ebd1e6b164d07ac29f47898f030cc328b Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 7 Mar 2025 17:27:19 +0100 Subject: [PATCH 33/34] update --- src/diffusers/quantizers/quanto/quanto_quantizer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/diffusers/quantizers/quanto/quanto_quantizer.py b/src/diffusers/quantizers/quanto/quanto_quantizer.py index 231b2fa2ff99..0120163804c9 100644 --- a/src/diffusers/quantizers/quanto/quanto_quantizer.py +++ b/src/diffusers/quantizers/quanto/quanto_quantizer.py @@ -168,10 +168,6 @@ def _process_model_before_weight_loading( def _process_model_after_weight_loading(self, model, **kwargs): return model - def _dequantize(self, model): - logger.warning("Dequantizing the full model is currently not supported with the Quanto backend") - return - @property def is_trainable(self): return True From 1b46a32f2aa18474236ccad4a11f5db80c85b0ca Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 10 Mar 2025 03:23:56 +0100 Subject: [PATCH 34/34] update --- docs/source/en/quantization/quanto.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index 025c9cf91805..d322d76be267 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -34,7 +34,12 @@ from diffusers import FluxTransformer2DModel, QuantoConfig model_id = "black-forest-labs/FLUX.1-dev" quantization_config = QuantoConfig(weights_dtype="float8") -transformer = FluxTransformer2DModel.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.bfloat16) +transformer = FluxTransformer2DModel.from_pretrained( + model_id, + subfolder="transformer", + quantization_config=quantization_config, + torch_dtype=torch.bfloat16, +) pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch_dtype) pipe.to("cuda")