@@ -508,6 +508,10 @@ Now we can compare the size and model accuracy with baseline model.
target device, it's just a representation of quantized computation in ATen
operators.
+ .. note::
+    The weights are still in fp32 right now; we may do constant propagation for the quantize op to
+    get integer weights in the future.
+
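+ As an illustrative aside (assuming ``quantized_model`` is the ``GraphModule``
+ returned by ``convert_pt2e`` above), printing the generated code is one way to
+ see the quantize/dequantize ops surrounding the still-fp32 weights:
+
+ .. code-block:: python
+
+    # the printed code contains quantize_per_tensor/dequantize_per_tensor
+    # calls wrapping the fp32 weights
+    print(quantized_model.code)
+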
If you want to get better accuracy or performance, try configuring
``quantizer`` in different ways, and each ``quantizer`` will have its own way
of configuration, so please consult the documentation for the
@@ -519,46 +523,54 @@ Save and Load Quantized Model
We'll show how to save and load the quantized model.
- .. code-block:: python
-
-    # 1. Save state_dict
-    pt2e_quantized_model_file_path = saved_model_dir + "resnet18_pt2e_quantized.pth"
-    torch.save(quantized_model.state_dict(), pt2e_quantized_model_file_path)
+ .. code-block:: python

-    # Get a reference output
+    # 0. Store a reference output for the example inputs and check evaluation accuracy:
     example_inputs = (next(iter(data_loader))[0],)
     ref = quantized_model(*example_inputs)
+    top1, top5 = evaluate(quantized_model, criterion, data_loader_test)
+    print("[before serialization] Evaluation accuracy on test dataset: %2.2f, %2.2f" % (top1.avg, top5.avg))

-    # 2. Initialize the quantized model and Load state_dict
-    # Rerun all steps to get a quantized model
-    model_to_quantize = load_model(saved_model_dir + float_model_file).to("cpu")
-    model_to_quantize.eval()
-    from torch._export import capture_pre_autograd_graph
-
-    exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs)
-    from torch.ao.quantization.quantizer.xnnpack_quantizer import (
-        XNNPACKQuantizer,
-        get_symmetric_quantization_config,
-    )
+    # 1. Export the model and save the ExportedProgram
+    pt2e_quantized_model_file_path = saved_model_dir + "resnet18_pt2e_quantized.pth"
+    # capture the model to get an ExportedProgram
+    quantized_ep = torch.export.export(quantized_model, example_inputs)
+    # use torch.export.save to save an ExportedProgram
+    torch.export.save(quantized_ep, pt2e_quantized_model_file_path)

-    quantizer = XNNPACKQuantizer()
-    quantizer.set_global(get_symmetric_quantization_config())
-    prepared_model = prepare_pt2e(exported_model, quantizer)
-    prepared_model(*example_inputs)
-    loaded_quantized_model = convert_pt2e(prepared_model)

-    # load the state_dict from saved file to intialized model
-    loaded_quantized_model.load_state_dict(torch.load(pt2e_quantized_model_file_path))
+    # 2. Load the saved ExportedProgram
+    loaded_quantized_ep = torch.export.load(pt2e_quantized_model_file_path)
+    loaded_quantized_model = loaded_quantized_ep.module()

-    # Sanity check with sample data
+    # 3. Check results for example inputs and check evaluation accuracy again:
     res = loaded_quantized_model(*example_inputs)
-
-    # 3. Evaluate the loaded quantized model
+    print("diff:", ref - res)
+
     top1, top5 = evaluate(loaded_quantized_model, criterion, data_loader_test)
     print("[after serialization/deserialization] Evaluation accuracy on test dataset: %2.2f, %2.2f" % (top1.avg, top5.avg))
+
+ Output:
+
+ .. code-block:: python
+
+    [before serialization] Evaluation accuracy on test dataset: 79.82, 94.55
+    diff: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
+            [0., 0., 0.,  ..., 0., 0., 0.],
+            [0., 0., 0.,  ..., 0., 0., 0.],
+            ...,
+            [0., 0., 0.,  ..., 0., 0., 0.],
+            [0., 0., 0.,  ..., 0., 0., 0.],
+            [0., 0., 0.,  ..., 0., 0., 0.]])
+
+    [after serialization/deserialization] Evaluation accuracy on test dataset: 79.82, 94.55
+
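+ As a quick illustrative check (a sketch, not one of the tutorial's measured
+ results), the on-disk size of the serialized ``ExportedProgram`` can be
+ inspected with the Python standard library:
+
+ .. code-block:: python
+
+    import os
+
+    # size of the serialized ExportedProgram, in megabytes
+    print("quantized model file size (MB):",
+          os.path.getsize(pt2e_quantized_model_file_path) / 1e6)
+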
Debugging the Quantized Model
- ----------------------------
+ ------------------------------
You can use `Numeric Suite <https://pytorch.org/docs/stable/quantization-accuracy-debugging.html#numerical-debugging-tooling-prototype >`_
that can help with debugging in eager mode and FX graph mode. The new version of
@@ -569,9 +581,10 @@ Lowering and Performance Evaluation
The model produced at this point is not the final model that runs on the device,
it is a reference quantized model that captures the intended quantized computation
- from the user, expressed as ATen operators, to get a model that runs on real
- devices, we'll need to lower the model. For example for the models that run on
- edge devices, we can lower to executorch.
+ from the user, expressed as ATen operators and some additional quantize/dequantize operators;
+ to get a model that runs on real devices, we'll need to lower the model.
+ For example, for models that run on edge devices, we can lower the model with delegation and ExecuTorch runtime
+ operators.
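+
+ As a rough sketch of that flow (this assumes the separate ``executorch``
+ package is installed and that ``executorch.exir.to_edge`` is available; exact
+ APIs may differ across versions):
+
+ .. code-block:: python
+
+    import torch
+    from executorch.exir import to_edge
+
+    # re-export the quantized model to get an ExportedProgram
+    quantized_ep = torch.export.export(quantized_model, example_inputs)
+    # convert to the edge dialect, then to an ExecuTorch program
+    et_program = to_edge(quantized_ep).to_executorch()
+    # serialize for the on-device ExecuTorch runtime
+    with open(saved_model_dir + "resnet18_pt2e_quantized.pte", "wb") as f:
+        f.write(et_program.buffer)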
Conclusion
--------------