
Commit 872c91e

add docs.

1 parent dc90b06 commit 872c91e

4 files changed (+106, -2 lines)

docs/source/en/api/quantization.md

Lines changed: 3 additions & 0 deletions

```diff
@@ -23,6 +23,9 @@ Learn how to quantize models in the [Quantization](../quantization/overview) guide.
 
 </Tip>
 
+## PipelineQuantizationConfig
+
+[[autodoc]] PipelineQuantizationConfig
 
 ## BitsAndBytesConfig
```
docs/source/en/quantization/overview.md

Lines changed: 59 additions & 0 deletions

@@ -39,3 +39,62 @@ Diffusers currently supports the following quantization methods.

- [Quanto](./quanto.md)

[This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
## Pipeline-level quantization

Diffusers can initialize pipelines directly from checkpoints that already contain quantized models ([example](https://huggingface.co/hf-internal-testing/flux.1-dev-nf4-pkg)). However, you may want to apply quantization on the fly when initializing a pipeline from a pretrained, non-quantized checkpoint. You can do this with [`PipelineQuantizationConfig`].

Start by defining a `PipelineQuantizationConfig`:
```py
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers.quantization_config import QuantoConfig
from diffusers.quantizers import PipelineQuantizationConfig
from transformers import BitsAndBytesConfig

pipeline_quant_config = PipelineQuantizationConfig(
    quant_mapping={
        "transformer": QuantoConfig(weights_dtype="int8"),
        "text_encoder_2": BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
        ),
    }
)
```
Then pass it to [`~DiffusionPipeline.from_pretrained`] and run inference:
```py
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
).to("cuda")

image = pipe("photo of a cute dog").images[0]
```
This method gives you granular control over the quantization of each model-level component in the pipeline, and it lets you use different quantization backends for different components. In the example above, you used a combination of Quanto and bitsandbytes.
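To sanity-check that both backends were applied, you can compare the memory footprint of each quantized component. Below is a minimal sketch, assuming the loaded models expose `get_memory_footprint` (transformers models do, and recent diffusers versions add it to `ModelMixin`):

```py
# Sanity check (a sketch, not part of the commit): compare per-component memory
# after quantization. Assumes `pipe` was created with the config above and that
# both components expose `get_memory_footprint`.
for name in ["transformer", "text_encoder_2"]:
    component = getattr(pipe, name)
    print(f"{name}: {component.get_memory_footprint() / 1e9:.2f} GB")
```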
The other method is simpler to use but less flexible. Start by defining a `PipelineQuantizationConfig`, this time in a different way:
```py
pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    },
    components_to_quantize=["transformer", "text_encoder_2"],
)
```
This `pipeline_quant_config` can now be passed to [`~DiffusionPipeline.from_pretrained`] just like in the example above:
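```py
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
).to("cuda")

image = pipe("photo of a cute dog").images[0]
```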
In this case, `quant_kwargs` is used to initialize the quantization configuration class of the chosen `quant_backend`, and `components_to_quantize` denotes the components that will be quantized. For most pipelines, you will want to keep `transformer` in the list, as it is usually the most compute- and memory-intensive component.
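Conceptually, this simpler form expands into a per-component mapping like the first method. The following is a rough sketch of the idea, not diffusers' actual implementation (in practice the library resolves the backend's config class from `diffusers` or `transformers` per component):

```py
# Illustrative only: roughly how quant_backend + quant_kwargs relate to quant_mapping.
# The real implementation picks the appropriate config class per component.
import torch
from transformers import BitsAndBytesConfig

quant_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
equivalent_mapping = {
    name: BitsAndBytesConfig(**quant_kwargs)
    for name in ["transformer", "text_encoder_2"]
}
```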

src/diffusers/quantizers/__init__.py

Lines changed: 43 additions & 1 deletion

@@ -33,7 +33,49 @@ class TransformersQuantConfigMixin:

The placeholder docstring (`"""TODO"""`) on `PipelineQuantizationConfig` is replaced with full documentation:

```py
class PipelineQuantizationConfig:
    """
    Configuration class to be used when applying quantization on the fly to [`~DiffusionPipeline.from_pretrained`].

    Args:
        quant_backend (`str`): Quantization backend to be used. When using this option, we assume that the backend
            is available to both `diffusers` and `transformers`.
        quant_kwargs (`dict`): Params to initialize the quantization backend class.
        components_to_quantize (`list`): Components of a pipeline to be quantized.
        quant_mapping (`dict`): Mapping defining the quantization specs to be used for the pipeline
            components. When using this argument, users are not expected to provide `quant_backend`,
            `quant_kwargs`, and `components_to_quantize`.

    Examples:

    When using with `quant_backend`:

    >>> import torch
    >>> from diffusers import DiffusionPipeline
    >>> from diffusers.quantizers import PipelineQuantizationConfig

    >>> pipeline_quant_config = PipelineQuantizationConfig(
    ...     quant_backend="bitsandbytes_4bit",
    ...     quant_kwargs={
    ...         "load_in_4bit": True,
    ...         "bnb_4bit_quant_type": "nf4",
    ...         "bnb_4bit_compute_dtype": torch.bfloat16,
    ...     },
    ...     components_to_quantize=["transformer", "text_encoder_2"],
    ... )

    >>> pipe = DiffusionPipeline.from_pretrained(
    ...     "black-forest-labs/FLUX.1-dev",
    ...     quantization_config=pipeline_quant_config,
    ...     torch_dtype=torch.bfloat16,
    ... ).to("cuda")

    >>> image = pipe("photo of a cute dog").images[0]

    When using with `quant_mapping`:

    >>> import torch
    >>> from diffusers import DiffusionPipeline
    >>> from diffusers.quantizers.quantization_config import QuantoConfig
    >>> from diffusers.quantizers import PipelineQuantizationConfig
    >>> from transformers import BitsAndBytesConfig

    >>> pipeline_quant_config = PipelineQuantizationConfig(
    ...     quant_mapping={
    ...         "transformer": QuantoConfig(weights_dtype="int8"),
    ...         "text_encoder_2": BitsAndBytesConfig(
    ...             load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
    ...         ),
    ...     }
    ... )

    >>> pipe = DiffusionPipeline.from_pretrained(
    ...     "black-forest-labs/FLUX.1-dev",
    ...     quantization_config=pipeline_quant_config,
    ...     torch_dtype=torch.bfloat16,
    ... ).to("cuda")

    >>> image = pipe("photo of a cute dog").images[0]
    """

    def __init__(
        self,
```

src/diffusers/quantizers/quantization_config.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -75,7 +75,7 @@ def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs):
     Args:
         config_dict (`Dict[str, Any]`):
             Dictionary that will be used to instantiate the configuration object.
-        return_unused_kwargs (`bool`,*optional*, defaults to `False`):
+        return_unused_kwargs (`bool`, *optional*, defaults to `False`):
             Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in
             `PreTrainedModel`.
         kwargs (`Dict[str, Any]`):
```
