From a28d7634ec7369eccc01900567650ddedce7f620 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 15 Aug 2023 16:36:10 -0700 Subject: [PATCH 1/6] [pt2e][quant] Update some docs for pt2 export quantization Summary: . Test Plan: CI generated docs Reviewers: Subscribers: Tasks: Tags: --- prototype_source/pt2e_quant_ptq_static.rst | 49 +++++++++++++++------- prototype_source/pt2e_quantizer.rst | 30 ++++++++++++- 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index 4e7a7ea82fa..0f756fb3e36 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -430,23 +430,42 @@ Convert the Calibrated Model to a Quantized Model print(quantized_model) .. note:: - the model produced here also had some improvement upon the previous - `representations `_ in the FX graph mode quantizaiton, previously all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``, in the new flow, we choose to represent some of the operators with integer computation so that it's closer to the computation happens in hardwares. - For example, here is how we plan to represent a quantized linear operator: + At this step, we currently have two representations that you can choose from, but what exact representation + we offer in the long term might change based on feedbacks from users. - .. code-block:: python + * Q/DQ Representation (default) + Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. - def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_int32, bias_scale, bias_zero_point, output_scale, output_zero_point): - x_int16 = x_int8.to(torch.int16) - weight_int16 = weight_int8.to(torch.int16) - acc_int32 = torch.ops.out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point)) - acc_rescaled_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) - bias_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, bias_int32 - bias_zero_point, bias_scale / output_scale)) - out_int8 = torch.ops.aten.clamp(acc_rescaled_int32 + bias_int32 + output_zero_point, qmin, qmax).to(torch.int8) - return out_int8 - - For more details, please see: - `Quantized Model Representation `_. + .. code-block:: python + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8) + weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8) + weight_permuted = torch.ops.aten.permute_copy.default(weight_fp32, [1, 0]); + out_fp32 = torch.ops.aten.addmm.default(bias_fp32, x_fp32, weight_permuted) + out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( + out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) + return out_i8 + + * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (e.g. quantized linear), other ops are represented as (dq -> float32_op -> q), and q/dq are decomposed into more primitive operators. 
+ + You can get this representation by: convert_pt2e(..., use_reference_representation=True) + + .. code-block:: python + # Reference Quantized Pattern for quantized linear + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): + x_int16 = x_int8.to(torch.int16) + weight_int16 = weight_int8.to(torch.int16) + acc_int32 = torch.ops.out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point)) + acc_rescaled_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) + bias_scale = x_scale * weight_scale + bias_int32 = out_dtype(torch.ops.aten.mul.Tensor, torch.int32, bias_fp32, bias_scale / out_scale) + out_int8 = torch.ops.aten.clamp(acc_rescaled_int32 + bias_int32 + output_zero_point, qmin, qmax).to(torch.int8) + return out_int8 + + + Please see ``_ for the most up to date reference representations. Checking Model Size and Accuracy Evaluation diff --git a/prototype_source/pt2e_quantizer.rst b/prototype_source/pt2e_quantizer.rst index c4dcce4116a..4c57d70f7ce 100644 --- a/prototype_source/pt2e_quantizer.rst +++ b/prototype_source/pt2e_quantizer.rst @@ -9,6 +9,7 @@ Prerequisites: ^^^^^^^^^^^^^^^^ Required: + - `Torchdynamo concepts in PyTorch `__ - `Quantization concepts in PyTorch `__ @@ -16,6 +17,7 @@ Required: - `(prototype) PyTorch 2.0 Export Post Training Static Quantization `__ Optional: + - `FX Graph Mode post training static quantization `__ - `BackendConfig in PyTorch Quantization FX Graph Mode `__ @@ -141,7 +143,33 @@ parameters can be shared among some tensors explicitly. Two typical use cases ar ``SharedQuantizationSpec`` is designed for this use case to annotate tensors whose quantization parameters are shared with other tensors. Input of ``SharedQuantizationSpec`` is an ``EdgeOrNode`` object which -can be an input edge or an output value. +can be an input edge or an output value. + +.. note:: + * Sharing is Transitive + Some Tensors might be effectively be using shared quantization spec due to (1) two nodes/edges are + configured to use SharedQuantizationSpec (2) there is existing sharing of some of the nodes + + For example, let's say we have two conv nodes conv1 and conv2, and both of them are fed into a cat + node. `cat([conv1_out, conv2_out], ...)` Let's say output of conv1, conv2 and first input of cat are configured + with the same configurations of QuantizationSpec, second input of cat is configured to use SharedQuantizationSpec + with the first input. + conv1_out: qspec1(dtype=torch.int8, ...) + conv2_out: qspec1(dtype=torch.int8, ...) + cat_input0: qspec1(dtype=torch.int8, ...) + cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat + + First of all, the output of conv1 are implicitly sharing quantization parameter (and observer object) + with first input of cat, and same for output of conv2 and second input of cat. + So since user configures the two input of cat to share quantization parameters, by transitivity, + conv2_out and conv1_out will also be sharing quantization parameters. In the observed graph, you + will see: + ``` + conv1 -> obs -> cat + conv2 -> obs / + ``` + and both `obs` will be the same observer instance + - Input edge is the connection between input node and the node consuming the input, so it's a ``Tuple[Node, Node]``. 
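For illustration, here is a minimal sketch of how the ``cat`` sharing example described in the note above could be annotated inside a ``Quantizer.annotate`` method. It assumes the ``QuantizationSpec``, ``SharedQuantizationSpec``, and ``QuantizationAnnotation`` classes from ``torch.ao.quantization.quantizer`` that this tutorial introduces; exact module paths and field names may differ between releases, and the ``annotate_cat`` helper is a hypothetical name used only for this sketch.

.. code-block:: python

    import torch
    from torch.ao.quantization.observer import HistogramObserver
    from torch.ao.quantization.quantizer import (
        QuantizationAnnotation,
        QuantizationSpec,
        SharedQuantizationSpec,
    )

    # per-tensor int8 activation spec, playing the role of qspec1 in the note above
    act_qspec = QuantizationSpec(
        dtype=torch.int8,
        quant_min=-128,
        quant_max=127,
        qscheme=torch.per_tensor_affine,
        observer_or_fake_quant_ctr=HistogramObserver,
    )

    def annotate_cat(cat_node: torch.fx.Node) -> None:
        # aten.cat takes a list of inputs as its first argument: [conv1_out, conv2_out].
        # The conv outputs themselves are assumed to be annotated elsewhere with act_qspec.
        conv1_node, conv2_node = cat_node.args[0]
        # The second input edge shares quantization parameters with the first input edge;
        # by transitivity, conv1_out, conv2_out, and both cat inputs end up sharing.
        shared_with_input0 = SharedQuantizationSpec((conv1_node, cat_node))
        cat_node.meta["quantization_annotation"] = QuantizationAnnotation(
            input_qspec_map={
                conv1_node: act_qspec,       # cat_input0: qspec1
                conv2_node: shared_with_input0,  # cat_input1: shared with first input
            },
            output_qspec=act_qspec,
            _annotated=True,
        )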
From 93aedaa480ab1c6b755551064f90860a7caa981a Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Thu, 17 Aug 2023 17:30:36 -0700 Subject: [PATCH 2/6] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- prototype_source/pt2e_quant_ptq_static.rst | 7 ++-- prototype_source/pt2e_quantizer.rst | 37 ++++++++++++---------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index 0f756fb3e36..a358b5361f6 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -437,6 +437,7 @@ Convert the Calibrated Model to a Quantized Model Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. .. code-block:: python + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8) @@ -448,9 +449,9 @@ Convert the Calibrated Model to a Quantized Model out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) return out_i8 - * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (e.g. quantized linear), other ops are represented as (dq -> float32_op -> q), and q/dq are decomposed into more primitive operators. + * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (dq -> float32_op -> q), and q/dq are decomposed into more primitive operators. - You can get this representation by: convert_pt2e(..., use_reference_representation=True) + You can get this representation by: ``convert_pt2e(..., use_reference_representation=True)`` .. code-block:: python # Reference Quantized Pattern for quantized linear @@ -465,7 +466,7 @@ Convert the Calibrated Model to a Quantized Model return out_int8 - Please see ``_ for the most up to date reference representations. + See `here `_ for the most up-to-date reference representations. Checking Model Size and Accuracy Evaluation diff --git a/prototype_source/pt2e_quantizer.rst b/prototype_source/pt2e_quantizer.rst index 4c57d70f7ce..484ad6c94d8 100644 --- a/prototype_source/pt2e_quantizer.rst +++ b/prototype_source/pt2e_quantizer.rst @@ -146,29 +146,34 @@ parameters are shared with other tensors. Input of ``SharedQuantizationSpec`` is can be an input edge or an output value. .. note:: - * Sharing is Transitive - Some Tensors might be effectively be using shared quantization spec due to (1) two nodes/edges are + * Sharing is transitive + + Some Tensors might be effectively using shared quantization spec due to (1) two nodes/edges are configured to use SharedQuantizationSpec (2) there is existing sharing of some of the nodes - For example, let's say we have two conv nodes conv1 and conv2, and both of them are fed into a cat - node. `cat([conv1_out, conv2_out], ...)` Let's say output of conv1, conv2 and first input of cat are configured - with the same configurations of QuantizationSpec, second input of cat is configured to use SharedQuantizationSpec + For example, let's say we have two ``conv`` nodes ``conv1`` and ``conv2``, and both of them are fed into a ``cat`` + node. 
`cat([conv1_out, conv2_out], ...)` Let's say output of ``conv1``, ``conv2`` and the first input of ``cat`` are configured + with the same configurations of ``QuantizationSpec``, second input of ``cat`` is configured to use ``SharedQuantizationSpec`` with the first input. - conv1_out: qspec1(dtype=torch.int8, ...) - conv2_out: qspec1(dtype=torch.int8, ...) - cat_input0: qspec1(dtype=torch.int8, ...) - cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat - First of all, the output of conv1 are implicitly sharing quantization parameter (and observer object) - with first input of cat, and same for output of conv2 and second input of cat. - So since user configures the two input of cat to share quantization parameters, by transitivity, - conv2_out and conv1_out will also be sharing quantization parameters. In the observed graph, you + .. code-block:: + + conv1_out: qspec1(dtype=torch.int8, ...) + conv2_out: qspec1(dtype=torch.int8, ...) + cat_input0: qspec1(dtype=torch.int8, ...) + cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat + + First of all, the output of ``conv1`` is implicitly sharing quantization parameter (and observer object) + with the first input of ``cat``, and same for output of ``conv2`` and the second input of ``cat``. + So since user configures the two inputs of ``cat`` to share quantization parameters, by transitivity, + ``conv2_out`` and ``conv1_out`` will also be sharing quantization parameters. In the observed graph, you will see: - ``` + .. code-block:: + conv1 -> obs -> cat conv2 -> obs / - ``` - and both `obs` will be the same observer instance + + and both ``obs`` will be the same observer instance. - Input edge is the connection between input node and the node consuming the input, From 7e504c1185f68e4bdb18ad8816a6481f7ed5e308 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Aug 2023 14:21:06 -0700 Subject: [PATCH 3/6] Update capture API --- prototype_source/pt2e_quant_ptq_static.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index a358b5361f6..d1b483f3166 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -53,6 +53,7 @@ The PyTorch 2.0 export quantization API looks like this: .. code:: python import torch + from torch._export import capture_pre_autograd_graph class M(torch.nn.Module): def __init__(self): super().__init__() @@ -66,7 +67,9 @@ The PyTorch 2.0 export quantization API looks like this: m = M().eval() # Step 1. program capture - m = torch._dynamo.export(m, *example_inputs, aten_graph=True) + # NOTE: this API will be updated to torch.export API in the future, but the captured + # result shoud mostly stay the same + m = capture_pre_autograd_graph(m, *example_inputs) # we get a model with aten ops @@ -352,10 +355,13 @@ Here is how you can use ``torch.export`` to export the model: .. code-block:: python - import torch._dynamo as torchdynamo + from torch._export import capture_pre_autograd_graph example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model, _ = torchdynamo.export(model_to_quantize, *example_inputs, aten_graph=True, tracing_mode="symbolic") + exported_model, _ = capture_pre_autograd_graph(model_to_quantize, *example_inputs) + + +``capture_pre_autograd_graph`` is a short term API, it will be updated to use the offical ``torch.export`` API when that is ready. 
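To show where the capture step fits, here is a condensed sketch of the full flow around ``capture_pre_autograd_graph``, using the ``XNNPACKQuantizer``, ``prepare_pt2e``, and ``convert_pt2e`` APIs referenced elsewhere in this tutorial. The tiny ``Sequential`` model is only a stand-in for the float model loaded earlier, and these prototype APIs may still change signatures.

.. code-block:: python

    import torch
    from torch._export import capture_pre_autograd_graph
    from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
    from torch.ao.quantization.quantizer.xnnpack_quantizer import (
        XNNPACKQuantizer,
        get_symmetric_quantization_config,
    )

    # stand-in for the eval-mode float model used in the rest of the tutorial
    model_to_quantize = torch.nn.Sequential(
        torch.nn.Conv2d(3, 16, 3),
        torch.nn.ReLU(),
    ).eval()

    example_inputs = (torch.rand(2, 3, 224, 224),)
    exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs)

    # configure the backend-specific quantizer and insert observers
    quantizer = XNNPACKQuantizer()
    quantizer.set_global(get_symmetric_quantization_config())
    prepared_model = prepare_pt2e(exported_model, quantizer)

    # calibrate with representative data, then produce the quantized model
    with torch.no_grad():
        prepared_model(*example_inputs)
    quantized_model = convert_pt2e(prepared_model)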
Import the Backend Specific Quantizer and Configure how to Quantize the Model From ae209ccb01583cfc8477e103b4daaeadcb7ef286 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Aug 2023 15:41:32 -0700 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- prototype_source/pt2e_quant_ptq_static.rst | 13 ++++++----- prototype_source/pt2e_quantizer.rst | 26 +++++++++++++--------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index d1b483f3166..3736a5b0766 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -436,11 +436,13 @@ Convert the Calibrated Model to a Quantized Model print(quantized_model) .. note:: - At this step, we currently have two representations that you can choose from, but what exact representation - we offer in the long term might change based on feedbacks from users. + At this step, we currently have two representations that you can choose from, but exact representation + we offer in the long term might change based on feedback from PyTorch users. * Q/DQ Representation (default) - Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. + + Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. .. code-block:: python @@ -455,11 +457,12 @@ Convert the Calibrated Model to a Quantized Model out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) return out_i8 - * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (dq -> float32_op -> q), and q/dq are decomposed into more primitive operators. + * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (``dq -> float32_op -> q``), and ``q/dq`` are decomposed into more primitive operators. - You can get this representation by: ``convert_pt2e(..., use_reference_representation=True)`` + You can get this representation by using ``convert_pt2e(..., use_reference_representation=True)``. .. code-block:: python + # Reference Quantized Pattern for quantized linear def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): x_int16 = x_int8.to(torch.int16) diff --git a/prototype_source/pt2e_quantizer.rst b/prototype_source/pt2e_quantizer.rst index 484ad6c94d8..c8cc18dba8f 100644 --- a/prototype_source/pt2e_quantizer.rst +++ b/prototype_source/pt2e_quantizer.rst @@ -146,14 +146,17 @@ parameters are shared with other tensors. Input of ``SharedQuantizationSpec`` is can be an input edge or an output value. .. note:: - * Sharing is transitive - Some Tensors might be effectively using shared quantization spec due to (1) two nodes/edges are - configured to use SharedQuantizationSpec (2) there is existing sharing of some of the nodes + * Sharing is transitive + Some tensors might be effectively using shared quantization spec due to: + + * Two nodes/edges are configured to use ``SharedQuantizationSpec``. + * There is existing sharing of some nodes. 
+ For example, let's say we have two ``conv`` nodes ``conv1`` and ``conv2``, and both of them are fed into a ``cat`` - node. `cat([conv1_out, conv2_out], ...)` Let's say output of ``conv1``, ``conv2`` and the first input of ``cat`` are configured - with the same configurations of ``QuantizationSpec``, second input of ``cat`` is configured to use ``SharedQuantizationSpec`` + node: ``cat([conv1_out, conv2_out], ...)``. Let's say the output of ``conv1``, ``conv2``, and the first input of ``cat`` are configured + with the same configurations of ``QuantizationSpec``. The second input of ``cat`` is configured to use ``SharedQuantizationSpec`` with the first input. .. code-block:: @@ -163,15 +166,16 @@ can be an input edge or an output value. cat_input0: qspec1(dtype=torch.int8, ...) cat_input1: SharedQuantizationSpec((conv1, cat)) # conv1 node is the first input of cat - First of all, the output of ``conv1`` is implicitly sharing quantization parameter (and observer object) - with the first input of ``cat``, and same for output of ``conv2`` and the second input of ``cat``. - So since user configures the two inputs of ``cat`` to share quantization parameters, by transitivity, + First of all, the output of ``conv1`` is implicitly sharing quantization parameters (and observer object) + with the first input of ``cat``, and the same is true for the output of ``conv2`` and the second input of ``cat``. + Therefore, since the user configures the two inputs of ``cat`` to share quantization parameters, by transitivity, ``conv2_out`` and ``conv1_out`` will also be sharing quantization parameters. In the observed graph, you - will see: + will see the following: + .. code-block:: - conv1 -> obs -> cat - conv2 -> obs / + conv1 -> obs -> cat + conv2 -> obs / and both ``obs`` will be the same observer instance. From 018a7dca2f428981a93916347449ac3b25ad669f Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Aug 2023 19:54:10 -0700 Subject: [PATCH 5/6] formatting --- prototype_source/pt2e_quant_ptq_static.rst | 68 +++++++++++----------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index 3736a5b0766..dc44e2030ca 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -435,47 +435,47 @@ Convert the Calibrated Model to a Quantized Model quantized_model = convert_pt2e(prepared_model) print(quantized_model) -.. note:: - At this step, we currently have two representations that you can choose from, but exact representation - we offer in the long term might change based on feedback from PyTorch users. +At this step, we currently have two representations that you can choose from, but exact representation +we offer in the long term might change based on feedback from PyTorch users. - * Q/DQ Representation (default) +* Q/DQ Representation (default) - Previous documentation for `representations `_ all quantized operators are represented as ``dequantize -> fp32_op -> qauntize``. - .. 
code-block:: python - - def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): - x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( - x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8) - weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( - weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8) - weight_permuted = torch.ops.aten.permute_copy.default(weight_fp32, [1, 0]); - out_fp32 = torch.ops.aten.addmm.default(bias_fp32, x_fp32, weight_permuted) - out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( - out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) - return out_i8 - - * Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (``dq -> float32_op -> q``), and ``q/dq`` are decomposed into more primitive operators. - - You can get this representation by using ``convert_pt2e(..., use_reference_representation=True)``. +.. code-block:: python - .. code-block:: python - - # Reference Quantized Pattern for quantized linear - def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): - x_int16 = x_int8.to(torch.int16) - weight_int16 = weight_int8.to(torch.int16) - acc_int32 = torch.ops.out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point)) - acc_rescaled_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) - bias_scale = x_scale * weight_scale - bias_int32 = out_dtype(torch.ops.aten.mul.Tensor, torch.int32, bias_fp32, bias_scale / out_scale) - out_int8 = torch.ops.aten.clamp(acc_rescaled_int32 + bias_int32 + output_zero_point, qmin, qmax).to(torch.int8) - return out_int8 + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): + x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8) + weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor( + weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8) + weight_permuted = torch.ops.aten.permute_copy.default(weight_fp32, [1, 0]); + out_fp32 = torch.ops.aten.addmm.default(bias_fp32, x_fp32, weight_permuted) + out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor( + out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) + return out_i8 + +* Reference Quantized Model Representation (WIP, expected to be ready at end of August): we have special representation for selected ops (for example, quantized linear), other ops are represented as (``dq -> float32_op -> q``), and ``q/dq`` are decomposed into more primitive operators. +You can get this representation by using ``convert_pt2e(..., use_reference_representation=True)``. - See `here `_ for the most up-to-date reference representations. +.. 
code-block:: python + + # Reference Quantized Pattern for quantized linear + def quantized_linear(x_int8, x_scale, x_zero_point, weight_int8, weight_scale, weight_zero_point, bias_fp32, output_scale, output_zero_point): + x_int16 = x_int8.to(torch.int16) + weight_int16 = weight_int8.to(torch.int16) + acc_int32 = torch.ops.out_dtype(torch.mm, torch.int32, (x_int16 - x_zero_point), (weight_int16 - weight_zero_point)) + bias_scale = x_scale * weight_scale + bias_int32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale) + acc_int32 = acc_int32 + bias_int32 + acc_int32 = torch.ops.out_dtype(torch.ops.aten.mul.Scalar, torch.int32, acc_int32, x_scale * weight_scale / output_scale) + output_zero_point + out_int8 = torch.ops.aten.clamp(acc_int32, qmin, qmax).to(torch.int8) + return out_int8 + + +See `here `_ for the most up-to-date reference representations. Checking Model Size and Accuracy Evaluation From 12386c991dbcb7a044045aceefcc31331b170470 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Aug 2023 21:09:59 -0700 Subject: [PATCH 6/6] fix export code/text --- prototype_source/pt2e_quant_ptq_static.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/prototype_source/pt2e_quant_ptq_static.rst b/prototype_source/pt2e_quant_ptq_static.rst index dc44e2030ca..d44fbd39f76 100644 --- a/prototype_source/pt2e_quant_ptq_static.rst +++ b/prototype_source/pt2e_quant_ptq_static.rst @@ -22,7 +22,7 @@ this: \ / \ / —------------------------------------------------------- - | Dynamo Export | + | Export | —------------------------------------------------------- | FX Graph in ATen XNNPACKQuantizer, @@ -30,19 +30,19 @@ this: | or | / —-------------------------------------------------------- - | prepare_pt2e | + | prepare_pt2e | —-------------------------------------------------------- | Calibrate/Train | —-------------------------------------------------------- - | convert_pt2e | + | convert_pt2e | —-------------------------------------------------------- | Reference Quantized Model | —-------------------------------------------------------- - | Lowering | + | Lowering | —-------------------------------------------------------- | Executorch, or Inductor, or @@ -189,8 +189,6 @@ and rename it to ``data/resnet18_pretrained_float.pth``. import numpy as np import torch - from torch.ao.quantization import get_default_qconfig, QConfigMapping - from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx, fuse_fx import torch.nn as nn from torch.utils.data import DataLoader @@ -358,7 +356,10 @@ Here is how you can use ``torch.export`` to export the model: from torch._export import capture_pre_autograd_graph example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model, _ = capture_pre_autograd_graph(model_to_quantize, *example_inputs) + exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) + # or capture with dynamic dimensions + # from torch._export import dynamic_dim + # exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs, constraints=[dynamic_dim(example_inputs[0], 0)]) ``capture_pre_autograd_graph`` is a short term API, it will be updated to use the offical ``torch.export`` API when that is ready. @@ -532,9 +533,9 @@ We'll show how to save and load the quantized model. 
         # Rerun all steps to get a quantized model
         model_to_quantize = load_model(saved_model_dir + float_model_file).to("cpu")
         model_to_quantize.eval()
-        import torch._dynamo as torchdynamo
+        from torch._export import capture_pre_autograd_graph
 
-        exported_model, _ = torchdynamo.export(model_to_quantize, *copy.deepcopy(example_inputs), aten_graph=True, tracing_mode="symbolic")
+        exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs)
 
         from torch.ao.quantization.quantizer.xnnpack_quantizer import (
             XNNPACKQuantizer,
             get_symmetric_quantization_config,
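Consistent with the ``# Rerun all steps to get a quantized model`` comment above, one way to round-trip the quantized model is to save its ``state_dict``, rebuild the quantized graph by re-running export, prepare, calibration, and convert, and then restore the saved parameters. The sketch below assumes the names from the code above (``exported_model``, ``example_inputs``, ``quantized_model``); ``checkpoint_path`` and ``loaded_quantized_model`` are placeholder names for illustration, not part of the tutorial's code.

.. code-block:: python

    import torch
    from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
    from torch.ao.quantization.quantizer.xnnpack_quantizer import (
        XNNPACKQuantizer,
        get_symmetric_quantization_config,
    )

    checkpoint_path = "quantized_model_checkpoint.pth"  # placeholder path

    # Save: the model returned by convert_pt2e is a torch.fx.GraphModule, so its
    # quantized weights and buffers can be captured with state_dict().
    torch.save(quantized_model.state_dict(), checkpoint_path)

    # Load: rebuild the quantized graph the same way as before ("rerun all steps"),
    # then restore the saved parameters.
    quantizer = XNNPACKQuantizer()
    quantizer.set_global(get_symmetric_quantization_config())
    prepared_model = prepare_pt2e(exported_model, quantizer)
    prepared_model(*example_inputs)  # calibration pass so observers see real data
    loaded_quantized_model = convert_pt2e(prepared_model)
    loaded_quantized_model.load_state_dict(torch.load(checkpoint_path))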