From a28d7634ec7369eccc01900567650ddedce7f620 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 15 Aug 2023 16:36:10 -0700 Subject: [PATCH 1/6] [pt2e][quant] Update some docs for pt2 export quantization Summary: . Test Plan: CI generated docs Reviewers: Subscribers: Tasks: Tags: --- prototype_source/pt2e_quant_ptq_static.rst | 49 +++++++++++++++------- prototype_source/pt2e_quantizer.rst | 30 ++++++++++++- 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index 4e7a7ea82fa..0f756fb3e36 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -430,23 +430,42 @@ Convert the Calibrated Model to a Quantized Model print(quantized_model) .. note:: - the model produced here also had some improvement upon the previous - `representations `_ in the FX graph mode quantizaiton, previously all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``, in the new flow, we choose to represent some of the operators with integer computation so that it's closer to the computation happens in hardwares. - For example, here is how we plan to represent a quantized linear operator: + At this step, we currently have two representations that you can choose from, but what exact representation + we offer in the long term might change based on feedbacks from users. - .. code-block:: python + * Q/DQ Representation (default) + Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. - def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_int32, bias_scale, bias_zero_point, output_scale, output_zero_point): - x_int16 = x_int8.to(torch.int16) - weight_int16 = weight_int8.to(torch.int16) - acc_int32 = torch.ops.out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point)) - acc_rescaled_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) - bias_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, bias_int32 - bias_zero_point, bias_scale / output_scale)) - out_int8 = torch.ops.aten.clamp(acc_rescaled_int32 + bias_int32 + output_zero_point, qmin, qmax).to(torch.int8) - return out_int8 - - For more details, please see: - `Quantized Model Representation `_. + .. code-block:: python + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8) + weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8) + weight_permuted = torch.ops.aten.permute_copy.default(weight_fp32, [1, 0]); + out_fp32 = torch.ops.aten.addmm.default(bias_fp32, x_fp32, weight_permuted) + out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( + out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) + return out_i8 + + * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (e.g. quantized linear), other ops are represented as (dq -> float32_op -> q), and q/dq are decomposed into more primitive operators. 
+ + You can get this representation by: convert_pt2e(..., use_reference_representation=True) + + .. code-block:: python + # Reference Quantized Pattern for quantized linear + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): + x_int16 = x_int8.to(torch.int16) + weight_int16 = weight_int8.to(torch.int16) + acc_int32 = torch.ops.out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point)) + acc_rescaled_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) + bias_scale = x_scale * weight_scale + bias_int32 = out_dtype(torch.ops.aten.mul.Tensor, torch.int32, bias_fp32, bias_scale / out_scale) + out_int8 = torch.ops.aten.clamp(acc_rescaled_int32 + bias_int32 + output_zero_point, qmin, qmax).to(torch.int8) + return out_int8 + + + Please see ``_ for the most up to date reference representations. Checking Model Size and Accuracy Evaluation diff --git a/prototype_source/pt2e_quantizer.rst b/prototype_source/pt2e_quantizer.rst index c4dcce4116a..4c57d70f7ce 100644 --- a/prototype_source/pt2e_quantizer.rst +++ b/prototype_source/pt2e_quantizer.rst @@ -9,6 +9,7 @@ Prerequisites: ^^^^^^^^^^^^^^^^ Required: + - `Torchdynamo concepts in PyTorch `__ - `Quantization concepts in PyTorch `__ @@ -16,6 +17,7 @@ Required: - `(prototype) PyTorch 2.0 Export Post Training Static Quantization `__ Optional: + - `FX Graph Mode post training static quantization `__ - `BackendConfig in PyTorch Quantization FX Graph Mode `__ @@ -141,7 +143,33 @@ parameters can be shared among some tensors explicitly. Two typical use cases ar ``SharedQuantizationSpec`` is designed for this use case to annotate tensors whose quantization parameters are shared with other tensors. Input of ``SharedQuantizationSpec`` is an ``EdgeOrNode`` object which -can be an input edge or an output value. +can be an input edge or an output value. + +.. note:: + * Sharing is Transitive + Some Tensors might be effectively be using shared quantization spec due to (1) two nodes/edges are + configured to use SharedQuantizationSpec (2) there is existing sharing of some of the nodes + + For example, let's say we have two conv nodes conv1 and conv2, and both of them are fed into a cat + node. `cat([conv1_out, conv2_out], ...)` Let's say output of conv1, conv2 and first input of cat are configured + with the same configurations of QuantizationSpec, second input of cat is configured to use SharedQuantizationSpec + with the first input. + conv1_out: qspec1(dtype=torch.int8, ...) + conv2_out: qspec1(dtype=torch.int8, ...) + cat_input0: qspec1(dtype=torch.int8, ...) + cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat + + First of all, the output of conv1 are implicitly sharing quantization parameter (and observer object) + with first input of cat, and same for output of conv2 and second input of cat. + So since user configures the two input of cat to share quantization parameters, by transitivity, + conv2_out and conv1_out will also be sharing quantization parameters. In the observed graph, you + will see: + ``` + conv1 -> obs -> cat + conv2 -> obs / + ``` + and both `obs` will be the same observer instance + - Input edge is the connection between input node and the node consuming the input, so it's a ``Tuple[Node, Node]``. 
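For illustration, here is a minimal sketch of how the ``cat`` sharing example described in the note above could be annotated inside a ``Quantizer.annotate`` method. It assumes the ``QuantizationSpec``, ``SharedQuantizationSpec``, and ``QuantizationAnnotation`` classes from ``torch.ao.quantization.quantizer`` that this tutorial introduces; exact module paths and field names may differ between releases, and the ``annotate_cat`` helper is a hypothetical name used only for this sketch.

.. code-block:: python

    import torch
    from torch.ao.quantization.observer import HistogramObserver
    from torch.ao.quantization.quantizer import (
        QuantizationAnnotation,
        QuantizationSpec,
        SharedQuantizationSpec,
    )

    # per-tensor int8 activation spec, playing the role of qspec1 in the note above
    act_qspec = QuantizationSpec(
        dtype=torch.int8,
        quant_min=-128,
        quant_max=127,
        qscheme=torch.per_tensor_affine,
        observer_or_fake_quant_ctr=HistogramObserver,
    )

    def annotate_cat(cat_node: torch.fx.Node) -> None:
        # aten.cat takes a list of inputs as its first argument: [conv1_out, conv2_out].
        # The conv outputs themselves are assumed to be annotated elsewhere with act_qspec.
        conv1_node, conv2_node = cat_node.args[0]
        # The second input edge shares quantization parameters with the first input edge;
        # by transitivity, conv1_out, conv2_out, and both cat inputs end up sharing.
        shared_with_input0 = SharedQuantizationSpec((conv1_node, cat_node))
        cat_node.meta["quantization_annotation"] = QuantizationAnnotation(
            input_qspec_map={
                conv1_node: act_qspec,       # cat_input0: qspec1
                conv2_node: shared_with_input0,  # cat_input1: shared with first input
            },
            output_qspec=act_qspec,
            _annotated=True,
        )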
From 93aedaa480ab1c6b755551064f90860a7caa981a Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Thu, 17 Aug 2023 17:30:36 -0700 Subject: [PATCH 2/6] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- prototype_source/pt2e_quant_ptq_static.rst | 7 ++-- prototype_source/pt2e_quantizer.rst | 37 ++++++++++++---------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index 0f756fb3e36..a358b5361f6 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -437,6 +437,7 @@ Convert the Calibrated Model to a Quantized Model Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. .. code-block:: python + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8) @@ -448,9 +449,9 @@ Convert the Calibrated Model to a Quantized Model out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) return out_i8 - * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (e.g. quantized linear), other ops are represented as (dq -> float32_op -> q), and q/dq are decomposed into more primitive operators. + * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (dq -> float32_op -> q), and q/dq are decomposed into more primitive operators. - You can get this representation by: convert_pt2e(..., use_reference_representation=True) + You can get this representation by: ``convert_pt2e(..., use_reference_representation=True)`` .. code-block:: python # Reference Quantized Pattern for quantized linear @@ -465,7 +466,7 @@ Convert the Calibrated Model to a Quantized Model return out_int8 - Please see ``_ for the most up to date reference representations. + See `here `_ for the most up-to-date reference representations. Checking Model Size and Accuracy Evaluation diff --git a/prototype_source/pt2e_quantizer.rst b/prototype_source/pt2e_quantizer.rst index 4c57d70f7ce..484ad6c94d8 100644 --- a/prototype_source/pt2e_quantizer.rst +++ b/prototype_source/pt2e_quantizer.rst @@ -146,29 +146,34 @@ parameters are shared with other tensors. Input of ``SharedQuantizationSpec`` is can be an input edge or an output value. .. note:: - * Sharing is Transitive - Some Tensors might be effectively be using shared quantization spec due to (1) two nodes/edges are + * Sharing is transitive + + Some Tensors might be effectively using shared quantization spec due to (1) two nodes/edges are configured to use SharedQuantizationSpec (2) there is existing sharing of some of the nodes - For example, let's say we have two conv nodes conv1 and conv2, and both of them are fed into a cat - node. `cat([conv1_out, conv2_out], ...)` Let's say output of conv1, conv2 and first input of cat are configured - with the same configurations of QuantizationSpec, second input of cat is configured to use SharedQuantizationSpec + For example, let's say we have two ``conv`` nodes ``conv1`` and ``conv2``, and both of them are fed into a ``cat`` + node. 
`cat([conv1_out, conv2_out], ...)` Let's say output of ``conv1``, ``conv2`` and the first input of ``cat`` are configured + with the same configurations of ``QuantizationSpec``, second input of ``cat`` is configured to use ``SharedQuantizationSpec`` with the first input. - conv1_out: qspec1(dtype=torch.int8, ...) - conv2_out: qspec1(dtype=torch.int8, ...) - cat_input0: qspec1(dtype=torch.int8, ...) - cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat - First of all, the output of conv1 are implicitly sharing quantization parameter (and observer object) - with first input of cat, and same for output of conv2 and second input of cat. - So since user configures the two input of cat to share quantization parameters, by transitivity, - conv2_out and conv1_out will also be sharing quantization parameters. In the observed graph, you + .. code-block:: + + conv1_out: qspec1(dtype=torch.int8, ...) + conv2_out: qspec1(dtype=torch.int8, ...) + cat_input0: qspec1(dtype=torch.int8, ...) + cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat + + First of all, the output of ``conv1`` is implicitly sharing quantization parameter (and observer object) + with the first input of ``cat``, and same for output of ``conv2`` and the second input of ``cat``. + So since user configures the two inputs of ``cat`` to share quantization parameters, by transitivity, + ``conv2_out`` and ``conv1_out`` will also be sharing quantization parameters. In the observed graph, you will see: - ``` + .. code-block:: + conv1 -> obs -> cat conv2 -> obs / - ``` - and both `obs` will be the same observer instance + + and both ``obs`` will be the same observer instance. - Input edge is the connection between input node and the node consuming the input, From 7e504c1185f68e4bdb18ad8816a6481f7ed5e308 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Aug 2023 14:21:06 -0700 Subject: [PATCH 3/6] Update capture API --- prototype_source/pt2e_quant_ptq_static.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index a358b5361f6..d1b483f3166 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -53,6 +53,7 @@ The PyTorch 2.0 export quantization API looks like this: .. code:: python import torch + from torch._export import capture_pre_autograd_graph class M(torch.nn.Module): def __init__(self): super().__init__() @@ -66,7 +67,9 @@ The PyTorch 2.0 export quantization API looks like this: m = M().eval() # Step 1. program capture - m = torch._dynamo.export(m, *example_inputs, aten_graph=True) + # NOTE: this API will be updated to torch.export API in the future, but the captured + # result shoud mostly stay the same + m = capture_pre_autograd_graph(m, *example_inputs) # we get a model with aten ops @@ -352,10 +355,13 @@ Here is how you can use ``torch.export`` to export the model: .. code-block:: python - import torch._dynamo as torchdynamo + from torch._export import capture_pre_autograd_graph example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model, _ = torchdynamo.export(model_to_quantize, *example_inputs, aten_graph=True, tracing_mode="symbolic") + exported_model, _ = capture_pre_autograd_graph(model_to_quantize, *example_inputs) + + +``capture_pre_autograd_graph`` is a short term API, it will be updated to use the offical ``torch.export`` API when that is ready. 
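To show where the capture step fits, here is a condensed sketch of the full flow around ``capture_pre_autograd_graph``, using the ``XNNPACKQuantizer``, ``prepare_pt2e``, and ``convert_pt2e`` APIs referenced elsewhere in this tutorial. The tiny ``Sequential`` model is only a stand-in for the float model loaded earlier, and these prototype APIs may still change signatures.

.. code-block:: python

    import torch
    from torch._export import capture_pre_autograd_graph
    from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
    from torch.ao.quantization.quantizer.xnnpack_quantizer import (
        XNNPACKQuantizer,
        get_symmetric_quantization_config,
    )

    # stand-in for the eval-mode float model used in the rest of the tutorial
    model_to_quantize = torch.nn.Sequential(
        torch.nn.Conv2d(3, 16, 3),
        torch.nn.ReLU(),
    ).eval()

    example_inputs = (torch.rand(2, 3, 224, 224),)
    exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs)

    # configure the backend-specific quantizer and insert observers
    quantizer = XNNPACKQuantizer()
    quantizer.set_global(get_symmetric_quantization_config())
    prepared_model = prepare_pt2e(exported_model, quantizer)

    # calibrate with representative data, then produce the quantized model
    with torch.no_grad():
        prepared_model(*example_inputs)
    quantized_model = convert_pt2e(prepared_model)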
Import the Backend Specific Quantizer and Configure how to Quantize the Model From ae209ccb01583cfc8477e103b4daaeadcb7ef286 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Aug 2023 15:41:32 -0700 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- prototype_source/pt2e_quant_ptq_static.rst | 13 ++++++----- prototype_source/pt2e_quantizer.rst | 26 +++++++++++++--------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index d1b483f3166..3736a5b0766 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -436,11 +436,13 @@ Convert the Calibrated Model to a Quantized Model print(quantized_model) .. note:: - At this step, we currently have two representations that you can choose from, but what exact representation - we offer in the long term might change based on feedbacks from users. + At this step, we currently have two representations that you can choose from, but exact representation + we offer in the long term might change based on feedback from PyTorch users. * Q/DQ Representation (default) - Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. + + Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. .. code-block:: python @@ -455,11 +457,12 @@ Convert the Calibrated Model to a Quantized Model out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) return out_i8 - * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (dq -> float32_op -> q), and q/dq are decomposed into more primitive operators. + * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (``dq -> float32_op -> q``), and ``q/dq`` are decomposed into more primitive operators. - You can get this representation by: ``convert_pt2e(..., use_reference_representation=True)`` + You can get this representation by using ``convert_pt2e(..., use_reference_representation=True)``. .. code-block:: python + # Reference Quantized Pattern for quantized linear def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): x_int16 = x_int8.to(torch.int16) diff --git a/prototype_source/pt2e_quantizer.rst b/prototype_source/pt2e_quantizer.rst index 484ad6c94d8..c8cc18dba8f 100644 --- a/prototype_source/pt2e_quantizer.rst +++ b/prototype_source/pt2e_quantizer.rst @@ -146,14 +146,17 @@ parameters are shared with other tensors. Input of ``SharedQuantizationSpec`` is can be an input edge or an output value. .. note:: - * Sharing is transitive - Some Tensors might be effectively using shared quantization spec due to (1) two nodes/edges are - configured to use SharedQuantizationSpec (2) there is existing sharing of some of the nodes + * Sharing is transitive + Some tensors might be effectively using shared quantization spec due to: + + * Two nodes/edges are configured to use ``SharedQuantizationSpec``. + * There is existing sharing of some nodes. 
+ For example, let's say we have two ``conv`` nodes ``conv1`` and ``conv2``, and both of them are fed into a ``cat`` - node. `cat([conv1_out, conv2_out], ...)` Let's say output of ``conv1``, ``conv2`` and the first input of ``cat`` are configured - with the same configurations of ``QuantizationSpec``, second input of ``cat`` is configured to use ``SharedQuantizationSpec`` + node: ``cat([conv1_out, conv2_out], ...)``. Let's say the output of ``conv1``, ``conv2``, and the first input of ``cat`` are configured + with the same configurations of ``QuantizationSpec``. The second input of ``cat`` is configured to use ``SharedQuantizationSpec`` with the first input. .. code-block:: @@ -163,15 +166,16 @@ can be an input edge or an output value. cat_input0: qspec1(dtype=torch.int8, ...) cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat - First of all, the output of ``conv1`` is implicitly sharing quantization parameter (and observer object) - with the first input of ``cat``, and same for output of ``conv2`` and the second input of ``cat``. - So since user configures the two inputs of ``cat`` to share quantization parameters, by transitivity, + First of all, the output of ``conv1`` is implicitly sharing quantization parameters (and observer object) + with the first input of ``cat``, and the same is true for the output of ``conv2`` and the second input of ``cat``. + Therefore, since the user configures the two inputs of ``cat`` to share quantization parameters, by transitivity, ``conv2_out`` and ``conv1_out`` will also be sharing quantization parameters. In the observed graph, you - will see: + will see the following: + .. code-block:: - conv1 -> obs -> cat - conv2 -> obs / + conv1 -> obs -> cat + conv2 -> obs / and both ``obs`` will be the same observer instance. From 018a7dca2f428981a93916347449ac3b25ad669f Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Aug 2023 19:54:10 -0700 Subject: [PATCH 5/6] formatting --- prototype_source/pt2e_quant_ptq_static.rst | 68 +++++++++++----------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index 3736a5b0766..dc44e2030ca 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -435,47 +435,47 @@ Convert the Calibrated Model to a Quantized Model quantized_model = convert_pt2e(prepared_model) print(quantized_model) -.. note:: - At this step, we currently have two representations that you can choose from, but exact representation - we offer in the long term might change based on feedback from PyTorch users. +At this step, we currently have two representations that you can choose from, but exact representation +we offer in the long term might change based on feedback from PyTorch users. - * Q/DQ Representation (default) +* Q/DQ Representation (default) - Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. - .. 
code-block:: python - - def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): - x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( - x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8) - weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( - weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8) - weight_permuted = torch.ops.aten.permute_copy.default(weight_fp32, [1, 0]); - out_fp32 = torch.ops.aten.addmm.default(bias_fp32, x_fp32, weight_permuted) - out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( - out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) - return out_i8 - - * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (``dq -> float32_op -> q``), and ``q/dq`` are decomposed into more primitive operators. - - You can get this representation by using ``convert_pt2e(..., use_reference_representation=True)``. +.. code-block:: python - .. code-block:: python - - # Reference Quantized Pattern for quantized linear - def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): - x_int16 = x_int8.to(torch.int16) - weight_int16 = weight_int8.to(torch.int16) - acc_int32 = torch.ops.out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point)) - acc_rescaled_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) - bias_scale = x_scale * weight_scale - bias_int32 = out_dtype(torch.ops.aten.mul.Tensor, torch.int32, bias_fp32, bias_scale / out_scale) - out_int8 = torch.ops.aten.clamp(acc_rescaled_int32 + bias_int32 + output_zero_point, qmin, qmax).to(torch.int8) - return out_int8 + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8) + weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8) + weight_permuted = torch.ops.aten.permute_copy.default(weight_fp32, [1, 0]); + out_fp32 = torch.ops.aten.addmm.default(bias_fp32, x_fp32, weight_permuted) + out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( + out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) + return out_i8 + +* Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (``dq -> float32_op -> q``), and ``q/dq`` are decomposed into more primitive operators. +You can get this representation by using ``convert_pt2e(..., use_reference_representation=True)``. - See `here `_ for the most up-to-date reference representations. +.. 
code-block:: python + + # Reference Quantized Pattern for quantized linear + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): + x_int16 = x_int8.to(torch.int16) + weight_int16 = weight_int8.to(torch.int16) + acc_int32 = torch.ops.out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point)) + bias_scale = x_scale * weight_scale + bias_int32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale) + acc_int32 = acc_int32 + bias_int32 + acc_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) + output_zero_point + out_int8 = torch.ops.aten.clamp(acc_int32, qmin, qmax).to(torch.int8) + return out_int8 + + +See `here `_ for the most up-to-date reference representations. Checking Model Size and Accuracy Evaluation From 12386c991dbcb7a044045aceefcc31331b170470 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Aug 2023 21:09:59 -0700 Subject: [PATCH 6/6] fix export code/text --- prototype_source/pt2e_quant_ptq_static.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index dc44e2030ca..d44fbd39f76 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -22,7 +22,7 @@ this: \ / \ / —------------------------------------------------------- - | Dynamo Export | + | Export | —------------------------------------------------------- | FX Graph in ATen XNNPACKQuantizer, @@ -30,19 +30,19 @@ this: | or | / —-------------------------------------------------------- - | prepare_pt2e | + | prepare_pt2e | —-------------------------------------------------------- | Calibrate/Train | —-------------------------------------------------------- - | convert_pt2e | + | convert_pt2e | —-------------------------------------------------------- | Reference Quantized Model | —-------------------------------------------------------- - | Lowering | + | Lowering | —-------------------------------------------------------- | Executorch, or Inductor, or @@ -189,8 +189,6 @@ and rename it to ``data/resnet18_pretrained_float.pth``. import numpy as np import torch - from torch.ao.quantization import get_default_qconfig, QConfigMapping - from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx, fuse_fx import torch.nn as nn from torch.utils.data import DataLoader @@ -358,7 +356,10 @@ Here is how you can use ``torch.export`` to export the model: from torch._export import capture_pre_autograd_graph example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model, _ = capture_pre_autograd_graph(model_to_quantize, *example_inputs) + exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) + # or capture with dynamic dimensions + # from torch._export import dynamic_dim + # exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs, constraints=[dynamic_dim(example_inputs[0], 0)]) ``capture_pre_autograd_graph`` is a short term API, it will be updated to use the offical ``torch.export`` API when that is ready. @@ -532,9 +533,9 @@ We'll show how to save and load the quantized model. 
         # Rerun all steps to get a quantized model
         model_to_quantize = load_model(saved_model_dir + float_model_file).to("cpu")
         model_to_quantize.eval()
-        import torch._dynamo as torchdynamo
+        from torch._export import capture_pre_autograd_graph
 
-        exported_model, _ = torchdynamo.export(model_to_quantize, *copy.deepcopy(example_inputs), aten_graph=True, tracing_mode="symbolic")
+        exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs)
 
         from torch.ao.quantization.quantizer.xnnpack_quantizer import (
             XNNPACKQuantizer,
             get_symmetric_quantization_config,
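Consistent with the ``# Rerun all steps to get a quantized model`` comment above, one way to round-trip the quantized model is to save its ``state_dict``, rebuild the quantized graph by re-running export, prepare, calibration, and convert, and then restore the saved parameters. The sketch below assumes the names from the code above (``exported_model``, ``example_inputs``, ``quantized_model``); ``checkpoint_path`` and ``loaded_quantized_model`` are placeholder names for illustration, not part of the tutorial's code.

.. code-block:: python

    import torch
    from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
    from torch.ao.quantization.quantizer.xnnpack_quantizer import (
        XNNPACKQuantizer,
        get_symmetric_quantization_config,
    )

    checkpoint_path = "quantized_model_checkpoint.pth"  # placeholder path

    # Save: the model returned by convert_pt2e is a torch.fx.GraphModule, so its
    # quantized weights and buffers can be captured with state_dict().
    torch.save(quantized_model.state_dict(), checkpoint_path)

    # Load: rebuild the quantized graph the same way as before ("rerun all steps"),
    # then restore the saved parameters.
    quantizer = XNNPACKQuantizer()
    quantizer.set_global(get_symmetric_quantization_config())
    prepared_model = prepare_pt2e(exported_model, quantizer)
    prepared_model(*example_inputs)  # calibration pass so observers see real data
    loaded_quantized_model = convert_pt2e(prepared_model)
    loaded_quantized_model.load_state_dict(torch.load(checkpoint_path))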