From a9bd604fa3b514313904987aaafcfb8a3045e352 Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Thu, 18 Jun 2020 10:14:39 -0700 Subject: [PATCH 01/19] Update feature classification labels --- .../dynamic_quantization_tutorial.py | 4 +- .../static_quantization_tutorial.py | 2 +- index.rst | 12 ++-- .../dynamic_quantization_bert_tutorial.rst | 8 +-- intermediate_source/memory_format_tutorial.py | 26 ++++---- intermediate_source/named_tensor_tutorial.py | 2 +- .../quantized_transfer_learning_tutorial.rst | 10 +-- intermediate_source/rpc_tutorial.rst | 2 +- .../recipes/dynamic_quantization.py | 64 +++++++++---------- 9 files changed, 65 insertions(+), 65 deletions(-) diff --git a/advanced_source/dynamic_quantization_tutorial.py b/advanced_source/dynamic_quantization_tutorial.py index 990fd688651..cabdd90d224 100644 --- a/advanced_source/dynamic_quantization_tutorial.py +++ b/advanced_source/dynamic_quantization_tutorial.py @@ -1,5 +1,5 @@ """ -(experimental) Dynamic Quantization on an LSTM Word Language Model +(beta) Dynamic Quantization on an LSTM Word Language Model ================================================================== **Author**: `James Reed `_ @@ -13,7 +13,7 @@ to int, which can result in smaller model size and faster inference with only a small hit to accuracy. -In this tutorial, we'll apply the easiest form of quantization - +In this tutorial, we'll apply the easiest form of quantization - `dynamic quantization `_ - to an LSTM-based next word-prediction model, closely following the `word language model `_ diff --git a/advanced_source/static_quantization_tutorial.py b/advanced_source/static_quantization_tutorial.py index 60f9112a8c1..72ea3cc703e 100644 --- a/advanced_source/static_quantization_tutorial.py +++ b/advanced_source/static_quantization_tutorial.py @@ -1,5 +1,5 @@ """ -(experimental) Static Quantization with Eager Mode in PyTorch +(beta) Static Quantization with Eager Mode in PyTorch ========================================================= **Author**: `Raghuraman Krishnamoorthi `_ diff --git a/index.rst b/index.rst index 81113c919ed..9a8762e8026 100644 --- a/index.rst +++ b/index.rst @@ -203,14 +203,14 @@ Welcome to PyTorch Tutorials .. Frontend APIs .. customcarditem:: - :header: (experimental) Introduction to Named Tensors in PyTorch + :header: (prototype) Introduction to Named Tensors in PyTorch :card_description: Learn how to use PyTorch to train a Deep Q Learning (DQN) agent on the CartPole-v0 task from the OpenAI Gym. :image: _static/img/thumbnails/cropped/experimental-Introduction-to-Named-Tensors-in-PyTorch.png :link: intermediate/memory_format_tutorial.html :tags: Frontend-APIs,Named-Tensor,Best-Practice .. customcarditem:: - :header: (experimental) Channels Last Memory Format in PyTorch + :header: (beta) Channels Last Memory Format in PyTorch :card_description: Get an overview of Channels Last memory format and understand how it is used to order NCHW tensors in memory preserving dimensions. :image: _static/img/thumbnails/cropped/experimental-Channels-Last-Memory-Format-in-PyTorch.png :link: intermediate/memory_format_tutorial.html @@ -261,28 +261,28 @@ Welcome to PyTorch Tutorials :tags: Model-Optimization,Best-Practice .. customcarditem:: - :header: (experimental) Dynamic Quantization on an LSTM Word Language Model + :header: (beta) Dynamic Quantization on an LSTM Word Language Model :card_description: Apply dynamic quantization, the easiest form of quantization, to a LSTM-based next word prediction model. 
:image: _static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-an-LSTM-Word-Language-Model.png :link: advanced/dynamic_quantization_tutorial.html :tags: Text,Quantization,Model-Optimization .. customcarditem:: - :header: (experimental) Dynamic Quantization on BERT + :header: (beta) Dynamic Quantization on BERT :card_description: Apply the dynamic quantization on a BERT (Bidirectional Embedding Representations from Transformers) model. :image: _static/img/thumbnails/cropped/experimental-Dynamic-Quantization-on-BERT.png :link: intermediate/dynamic_quantization_bert_tutorial.html :tags: Text,Quantization,Model-Optimization .. customcarditem:: - :header: (experimental) Static Quantization with Eager Mode in PyTorch + :header: (beta) Static Quantization with Eager Mode in PyTorch :card_description: Learn techniques to impove a model's accuracy = post-training static quantization, per-channel quantization, and quantization-aware training. :image: _static/img/thumbnails/cropped/experimental-Static-Quantization-with-Eager-Mode-in-PyTorch.png :link: advanced/static_quantization_tutorial.html :tags: Image/Video,Quantization,Model-Optimization .. customcarditem:: - :header: (experimental) Quantized Transfer Learning for Computer Vision Tutorial + :header: (beta) Quantized Transfer Learning for Computer Vision Tutorial :card_description: Learn techniques to impove a model's accuracy - post-training static quantization, per-channel quantization, and quantization-aware training. :image: _static/img/thumbnails/cropped/experimental-Quantized-Transfer-Learning-for-Computer-Vision-Tutorial.png :link: advanced/static_quantization_tutorial.html diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.rst b/intermediate_source/dynamic_quantization_bert_tutorial.rst index c3c800bbf89..6642f6768c8 100644 --- a/intermediate_source/dynamic_quantization_bert_tutorial.rst +++ b/intermediate_source/dynamic_quantization_bert_tutorial.rst @@ -1,10 +1,10 @@ -(experimental) Dynamic Quantization on BERT +(beta) Dynamic Quantization on BERT =========================================== .. tip:: - To get the most of this tutorial, we suggest using this + To get the most of this tutorial, we suggest using this `Colab Version `_. This will allow you to experiment with the information presented below. - + **Author**: `Jianyu Huang `_ **Reviewed by**: `Raghuraman Krishnamoorthi `_ @@ -71,7 +71,7 @@ built-in F1 score calculation helper function. pip install transformers -Because we will be using the experimental parts of the PyTorch, it is +Because we will be using the beta parts of the PyTorch, it is recommended to install the latest version of torch and torchvision. You can find the most recent instructions on local installation `here `_. For example, to install on diff --git a/intermediate_source/memory_format_tutorial.py b/intermediate_source/memory_format_tutorial.py index 2c3109de1e1..e988457d93b 100644 --- a/intermediate_source/memory_format_tutorial.py +++ b/intermediate_source/memory_format_tutorial.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -(experimental) Channels Last Memory Format in PyTorch +(beta) Channels Last Memory Format in PyTorch ******************************************************* **Author**: `Vitaly Fedyunin `_ @@ -11,12 +11,12 @@ For example, classic (contiguous) storage of NCHW tensor (in our case it is two 2x2 images with 3 color channels) look like this: -.. figure:: /_static/img/classic_memory_format.png +.. 
figure:: /_static/img/classic_memory_format.png :alt: classic_memory_format Channels Last memory format orders data differently: -.. figure:: /_static/img/channels_last_memory_format.png +.. figure:: /_static/img/channels_last_memory_format.png :alt: channels_last_memory_format Pytorch supports memory formats (and provides back compatibility with existing models including eager, JIT, and TorchScript) by utilizing existing strides structure. @@ -34,7 +34,7 @@ # Memory Format API # ----------------------- # -# Here is how to convert tensors between contiguous and channels +# Here is how to convert tensors between contiguous and channels # last memory formats. ###################################################################### @@ -104,7 +104,7 @@ ###################################################################### # Performance Gains # ------------------------------------------------------------------------------------------- -# The most significant performance gains are observed on NVidia's hardware with +# The most significant performance gains are observed on NVidia's hardware with # Tensor Cores support. We were able to archive over 22% perf gains while running ' # AMP (Automated Mixed Precision) training scripts supplied by NVidia https://github.com/NVIDIA/apex. # @@ -144,7 +144,7 @@ ###################################################################### # Passing ``--channels-last true`` allows running a model in Channels Last format with observed 22% perf gain. -# +# # ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 --channels-last true ./data`` # opt_level = O2 @@ -192,7 +192,7 @@ # Converting existing models # -------------------------- # -# Channels Last support not limited by existing models, as any model can be converted to Channels Last and propagate format through the graph as soon as input formatted correctly. +# Channels Last support not limited by existing models, as any model can be converted to Channels Last and propagate format through the graph as soon as input formatted correctly. # # Need to be done once, after model initialization (or load) @@ -203,12 +203,12 @@ output = model(input) ####################################################################### -# However, not all operators fully converted to support Channels Last (usually returning -# contiguous output instead). That means you need to verify the list of used operators -# against supported operators list https://github.com/pytorch/pytorch/wiki/Operators-with-Channels-Last-support, +# However, not all operators fully converted to support Channels Last (usually returning +# contiguous output instead). That means you need to verify the list of used operators +# against supported operators list https://github.com/pytorch/pytorch/wiki/Operators-with-Channels-Last-support, # or introduce memory format checks into eager execution mode and run your model. -# -# After running the code below, operators will raise an exception if the output of the +# +# After running the code below, operators will raise an exception if the output of the # operator doesn't match the memory format of the input. # # @@ -282,7 +282,7 @@ def attribute(m): ###################################################################### # If you found an operator that doesn't support Channels Last tensors -# and you want to contribute, feel free to use following developers +# and you want to contribute, feel free to use following developers # guide https://github.com/pytorch/pytorch/wiki/Writing-memory-format-aware-operators. 
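Before moving on, a minimal sketch of the tensor-level conversion API that the Memory Format API section above refers to, assuming a PyTorch build with channels last support (1.5 or later):

.. code-block:: python

   import torch

   # A classic contiguous NCHW tensor: two 2x2 images with 3 color channels,
   # matching the example described at the top of this tutorial.
   x = torch.rand(2, 3, 2, 2)
   print(x.stride())   # (12, 4, 2, 1): the channel stride is H*W, so channels vary slowly

   # Reorder the underlying storage to channels last; the logical shape stays NCHW.
   y = x.to(memory_format=torch.channels_last)
   print(y.shape)      # torch.Size([2, 3, 2, 2])
   print(y.stride())   # (12, 1, 6, 3): the channel stride is now 1, so channels vary fastest
   print(y.is_contiguous(memory_format=torch.channels_last))  # True

   # Convert back to the classic contiguous layout.
   z = y.contiguous(memory_format=torch.contiguous_format)
   print(z.is_contiguous())  # True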
# diff --git a/intermediate_source/named_tensor_tutorial.py b/intermediate_source/named_tensor_tutorial.py index 09946a50809..34941604083 100644 --- a/intermediate_source/named_tensor_tutorial.py +++ b/intermediate_source/named_tensor_tutorial.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -(experimental) Introduction to Named Tensors in PyTorch +(prototype) Introduction to Named Tensors in PyTorch ******************************************************* **Author**: `Richard Zou `_ diff --git a/intermediate_source/quantized_transfer_learning_tutorial.rst b/intermediate_source/quantized_transfer_learning_tutorial.rst index 5d734922aed..a4be9eafe9a 100644 --- a/intermediate_source/quantized_transfer_learning_tutorial.rst +++ b/intermediate_source/quantized_transfer_learning_tutorial.rst @@ -1,10 +1,10 @@ -(experimental) Quantized Transfer Learning for Computer Vision Tutorial +(beta) Quantized Transfer Learning for Computer Vision Tutorial ======================================================================== .. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `_. - This will allow you to experiment with the information presented below. + To get the most of this tutorial, we suggest using this + `Colab Version `_. + This will allow you to experiment with the information presented below. **Author**: `Zafar Takhirov `_ @@ -62,7 +62,7 @@ such as installations and data loading/visualizations. Installing the Nightly Build ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Because you will be using the experimental parts of the PyTorch, it is +Because you will be using the beta parts of the PyTorch, it is recommended to install the latest version of ``torch`` and ``torchvision``. You can find the most recent instructions on local installation `here `_. diff --git a/intermediate_source/rpc_tutorial.rst b/intermediate_source/rpc_tutorial.rst index 6d149e80837..16d47f9379a 100644 --- a/intermediate_source/rpc_tutorial.rst +++ b/intermediate_source/rpc_tutorial.rst @@ -5,7 +5,7 @@ Getting Started with Distributed RPC Framework This tutorial uses two simple examples to demonstrate how to build distributed training with the `torch.distributed.rpc `__ -package which is first introduced as an experimental feature in PyTorch v1.4. +package which is first introduced as a prototype feature in PyTorch v1.4. Source code of the two examples can be found in `PyTorch examples `__. diff --git a/recipes_source/recipes/dynamic_quantization.py b/recipes_source/recipes/dynamic_quantization.py index 78dc1f5408a..945ea5f70fd 100644 --- a/recipes_source/recipes/dynamic_quantization.py +++ b/recipes_source/recipes/dynamic_quantization.py @@ -127,13 +127,13 @@ # define a very, very simple LSTM for demonstration purposes # in this case, we are wrapping nn.LSTM, one layer, no pre or post processing -# inspired by +# inspired by # https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html, by Robert Guthrie # and https://pytorch.org/tutorials/advanced/dynamic_quantization_tutorial.html class lstm_for_demonstration(nn.Module): """Elementary Long Short Term Memory style model which simply wraps nn.LSTM - Not to be used for anything other than demonstration. - """ + Not to be used for anything other than demonstration. 
+ """ def __init__(self,in_dim,out_dim,depth): super(lstm_for_demonstration,self).__init__() self.lstm = nn.LSTM(in_dim,out_dim,depth) @@ -142,7 +142,7 @@ def forward(self,inputs,hidden): out,hidden = self.lstm(inputs,hidden) return out, hidden - + torch.manual_seed(29592) # set the seed for reproducibility #shape parameters @@ -154,32 +154,32 @@ def forward(self,inputs,hidden): # random data for input inputs = torch.randn(sequence_length,batch_size,model_dimension) # hidden is actually is a tuple of the initial hidden state and the initial cell state -hidden = (torch.randn(lstm_depth,batch_size,model_dimension), torch.randn(lstm_depth,batch_size,model_dimension)) +hidden = (torch.randn(lstm_depth,batch_size,model_dimension), torch.randn(lstm_depth,batch_size,model_dimension)) ###################################################################### # 2: Do the Quantization # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # Now we get to the fun part. First we create an instance of the model # called float\_lstm then we are going to quantize it. We're going to use # the -# +# # :: -# +# # torch.quantization.quantize_dynamic() -# +# # function here (`see # documentation `__) # which takes the model, then a list of the submodules which we want to # have quantized if they appear, then the datatype we are targeting. This # function returns a quantized version of the original model as a new # module. -# +# # That's all it takes. -# +# - # here is our floating point instance + # here is our floating point instance float_lstm = lstm_for_demonstration(model_dimension, model_dimension,lstm_depth) # this is the call that does the work @@ -206,7 +206,7 @@ def forward(self,inputs,hidden): # (for example you can set model dimension to something like 80) this will # converge towards 4x smaller as the stored model size dominated more and # more by the parameter values. -# +# def print_size_of_model(model, label=""): torch.save(model.state_dict(), "temp.p") @@ -221,7 +221,7 @@ def print_size_of_model(model, label=""): print("{0:.2f} times smaller".format(f/q)) # note that this value is wrong in PyTorch 1.4 due to https://github.com/pytorch/pytorch/issues/31468 -# this will be fixed in 1.5 with https://github.com/pytorch/pytorch/pull/31540 +# this will be fixed in 1.5 with https://github.com/pytorch/pytorch/pull/31540 ###################################################################### @@ -229,15 +229,15 @@ def print_size_of_model(model, label=""): # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # The second benefit is that the quantized model will typically run # faster. This is due to a combinations of effects including at least: -# +# # 1. Less time spent moving parameter data in # 2. Faster INT8 operations -# +# # As you will see the quantized version of this super-simple network runs # faster. This will generally be true of more complex networks but as they # say "your milage may vary" depending on a number of factors including # the structure of the model and the hardware you are running on. -# +# # compare the performance print("Floating point FP32") @@ -255,10 +255,10 @@ def print_size_of_model(model, label=""): # trained one. However, I think it is worth quickly showing that the # quantized network does produce output tensors that are "in the same # ballpark" as the original one. -# +# # For a more detailed analysis please see the more advanced tutorials # referenced at the end of this recipe. 
-# +# # run the float model out1, hidden1 = float_lstm(inputs, hidden) @@ -270,7 +270,7 @@ def print_size_of_model(model, label=""): mag2 = torch.mean(abs(out2)).item() print('mean absolute value of output tensor values in the INT8 model is {0:.5f}'.format(mag2)) -# compare them +# compare them mag3 = torch.mean(abs(out1-out2)).item() print('mean absolute value of the difference between the output tensors is {0:.5f} or {1:.2f} percent'.format(mag3,mag3/mag1*100)) @@ -281,26 +281,26 @@ def print_size_of_model(model, label=""): # We've explained what dynamic quantization is, what benefits it brings, # and you have used the ``torch.quantization.quantize_dynamic()`` function # to quickly quantize a simple LSTM model. -# +# # This was a fast and high level treatment of this material; for more -# detail please continue learning with `(experimental) Dynamic Quantization on an LSTM Word Language Model Tutorial `_. -# -# +# detail please continue learning with `(beta) Dynamic Quantization on an LSTM Word Language Model Tutorial `_. +# +# # Additional Resources # ========= # Documentation # ~~~~~~~~~~~~~~ -# +# # `Quantization API Documentaion `_ -# +# # Tutorials # ~~~~~~~~~~~~~~ -# -# `(experimental) Dynamic Quantization on BERT `_ -# -# `(experimental) Dynamic Quantization on an LSTM Word Language Model `_ -# +# +# `(beta) Dynamic Quantization on BERT `_ +# +# `(beta) Dynamic Quantization on an LSTM Word Language Model `_ +# # Blogs # ~~~~~~~~~~~~~~ # ` Introduction to Quantization on PyTorch `_ -# +# From 45d02c7138ab9041cbe90255850d701da8b0ebc1 Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Mon, 22 Jun 2020 19:52:28 -0700 Subject: [PATCH 02/19] Update NVidia -> Nvidia --- intermediate_source/memory_format_tutorial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/intermediate_source/memory_format_tutorial.py b/intermediate_source/memory_format_tutorial.py index e988457d93b..244e23ac204 100644 --- a/intermediate_source/memory_format_tutorial.py +++ b/intermediate_source/memory_format_tutorial.py @@ -104,9 +104,9 @@ ###################################################################### # Performance Gains # ------------------------------------------------------------------------------------------- -# The most significant performance gains are observed on NVidia's hardware with +# The most significant performance gains are observed on Nvidia's hardware with # Tensor Cores support. We were able to archive over 22% perf gains while running ' -# AMP (Automated Mixed Precision) training scripts supplied by NVidia https://github.com/NVIDIA/apex. +# AMP (Automated Mixed Precision) training scripts supplied by Nvidia https://github.com/NVIDIA/apex. # # ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 ./data`` From 68c22a0baf3b838482b59cecb70d91f082b796c1 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 8 Jul 2020 11:33:22 -0400 Subject: [PATCH 03/19] Bring back default filename_pattern so that by default we run all galleries. Signed-off-by: Edward Z. 
Yang --- conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/conf.py b/conf.py index 93cba840893..26d2bb42df6 100644 --- a/conf.py +++ b/conf.py @@ -65,6 +65,7 @@ 'examples_dirs': ['beginner_source', 'intermediate_source', 'advanced_source', 'recipes_source'], 'gallery_dirs': ['beginner', 'intermediate', 'advanced', 'recipes'], + 'filename_pattern': 'tutorial.py', 'backreferences_dir': False } From b6d1838fcb1768d0341acd0526e2bd9e1db529ca Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Wed, 8 Jul 2020 14:27:39 -0700 Subject: [PATCH 04/19] Add prototype_source directory --- prototype_source/README.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 prototype_source/README.md diff --git a/prototype_source/README.md b/prototype_source/README.md new file mode 100644 index 00000000000..41003507b84 --- /dev/null +++ b/prototype_source/README.md @@ -0,0 +1,7 @@ +# Prototype Tutorials + +This directory contains tutorials demonstrating prototype features in PyTorch. + +**Prototype features** are not available as part of binary distributions like PyPI or Conda (except maybe behind run-time flags). To test these features we would, depending on the feature, recommend building from master or using the nightly wheelss that are made available on pytorch.org. + +*Level of commitment:* We are committing to gathering high bandwidth feedback only on these features. Based on this feedback and potential further engagement between community members, we as a community will decide if we want to upgrade the level of commitment or to fail fast. From 01fc1306debe681a893c4fa1ac08f588c6ac13a6 Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Wed, 8 Jul 2020 14:32:11 -0700 Subject: [PATCH 05/19] Add prototype directory --- .jenkins/build.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index d8f9fc28cfc..3fda35f457f 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -86,6 +86,17 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then FILES_TO_RUN+=($(basename $filename .py)) fi count=$((count+1)) + done + done + for filename in $(find prototype_source/ -name '*.py' -not -path '*/data/*'); do + if [ $(($count % $NUM_WORKERS)) != $WORKER_ID ]; then + echo "Removing runnable code from "$filename + python $DIR/remove_runnable_code.py $filename $filename + else + echo "Keeping "$filename + FILES_TO_RUN+=($(basename $filename .py)) + fi + count=$((count+1)) done echo "FILES_TO_RUN: " ${FILES_TO_RUN[@]} @@ -94,13 +105,13 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # Step 4: If any of the generated files are not related the tutorial files we want to run, # then we remove them - for filename in $(find docs/beginner docs/intermediate docs/advanced docs/recipes -name '*.html'); do + for filename in $(find docs/beginner docs/intermediate docs/advanced docs/recipes docs/prototype -name '*.html'); do file_basename=$(basename $filename .html) if [[ ! " ${FILES_TO_RUN[@]} " =~ " ${file_basename} " ]]; then rm $filename fi done - for filename in $(find docs/beginner docs/intermediate docs/advanced docs/recipes -name '*.rst'); do + for filename in $(find docs/beginner docs/intermediate docs/advanced docs/recipes docs/prototype -name '*.rst'); do file_basename=$(basename $filename .rst) if [[ ! 
" ${FILES_TO_RUN[@]} " =~ " ${file_basename} " ]]; then rm $filename @@ -124,7 +135,7 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then rm $filename fi done - for filename in $(find docs/.doctrees/beginner docs/.doctrees/intermediate docs/.doctrees/advanced docs/.doctrees/recipes -name '*.doctree'); do + for filename in $(find docs/.doctrees/beginner docs/.doctrees/intermediate docs/.doctrees/advanced docs/.doctrees/recipes docs/.doctrees/prototype -name '*.doctree'); do file_basename=$(basename $filename .doctree) if [[ ! " ${FILES_TO_RUN[@]} " =~ " ${file_basename} " ]]; then rm $filename From 26511ccdb6def7b4aa53674fe5c2a861418365a8 Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Wed, 8 Jul 2020 14:33:15 -0700 Subject: [PATCH 06/19] Add prototype --- conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf.py b/conf.py index 26d2bb42df6..683abf00899 100644 --- a/conf.py +++ b/conf.py @@ -63,8 +63,8 @@ sphinx_gallery_conf = { 'examples_dirs': ['beginner_source', 'intermediate_source', - 'advanced_source', 'recipes_source'], - 'gallery_dirs': ['beginner', 'intermediate', 'advanced', 'recipes'], + 'advanced_source', 'recipes_source', 'prototype_source'], + 'gallery_dirs': ['beginner', 'intermediate', 'advanced', 'recipes', 'prototype'], 'filename_pattern': 'tutorial.py', 'backreferences_dir': False } From fb779e1f7cfe65269e54878c9993f4cb4222bb87 Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Wed, 8 Jul 2020 14:57:55 -0700 Subject: [PATCH 07/19] Remove extra "done" --- .jenkins/build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 3fda35f457f..95ad25d52da 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -87,7 +87,6 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then fi count=$((count+1)) done - done for filename in $(find prototype_source/ -name '*.py' -not -path '*/data/*'); do if [ $(($count % $NUM_WORKERS)) != $WORKER_ID ]; then echo "Removing runnable code from "$filename From 494d037fcc7ccada21c1dc1003cb82cac11cb02b Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Wed, 8 Jul 2020 17:03:22 -0700 Subject: [PATCH 08/19] Add REAME.txt --- prototype_source/README.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 prototype_source/README.txt diff --git a/prototype_source/README.txt b/prototype_source/README.txt new file mode 100644 index 00000000000..b04371db306 --- /dev/null +++ b/prototype_source/README.txt @@ -0,0 +1,2 @@ +Prototype Tutorials +------------------ From d32aa04a5a5bdde5764e550be8a8c0c905566f09 Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Wed, 8 Jul 2020 17:58:00 -0700 Subject: [PATCH 09/19] Update for prototype instructions --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ca418738738..7ad9c816974 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ We use sphinx-gallery's [notebook styled examples](https://sphinx-gallery.github Here's how to create a new tutorial or recipe: 1. Create a notebook styled python file. If you want it executed while inserted into documentation, save the file with suffix `tutorial` so that file name is `your_tutorial.py`. 2. Put it in one of the beginner_source, intermediate_source, advanced_source based on the level. If it is a recipe, add to recipes_source. -2. For Tutorials, include it in the TOC tree at index.rst -3. For Tutorials, create a thumbnail in the [index.rst file](https://github.com/pytorch/tutorials/blob/master/index.rst) using a command like `.. 
customcarditem:: beginner/your_tutorial.html`. For Recipes, create a thumbnail in the [recipes_index.rst](https://github.com/pytorch/tutorials/blob/master/recipes_source/recipes_index.rst) +2. For Tutorials (except if prototype), include it in the TOC tree at index.rst +3. For Tutorials (except for prototype), create a thumbnail in the [index.rst file](https://github.com/pytorch/tutorials/blob/master/index.rst) using a command like `.. customcarditem:: beginner/your_tutorial.html`. For Recipes, create a thumbnail in the [recipes_index.rst](https://github.com/pytorch/tutorials/blob/master/recipes_source/recipes_index.rst) In case you prefer to write your tutorial in jupyter, you can use [this script](https://gist.github.com/chsasank/7218ca16f8d022e02a9c0deb94a310fe) to convert the notebook to python file. After conversion and addition to the project, please make sure the sections headings etc are in logical order. From 67f76d325999e42f7846dff69485ae2ce4ca0b18 Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Wed, 8 Jul 2020 17:58:49 -0700 Subject: [PATCH 10/19] Update for prototype feature --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7ad9c816974..5a9edf740ad 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ We use sphinx-gallery's [notebook styled examples](https://sphinx-gallery.github Here's how to create a new tutorial or recipe: 1. Create a notebook styled python file. If you want it executed while inserted into documentation, save the file with suffix `tutorial` so that file name is `your_tutorial.py`. 2. Put it in one of the beginner_source, intermediate_source, advanced_source based on the level. If it is a recipe, add to recipes_source. -2. For Tutorials (except if prototype), include it in the TOC tree at index.rst -3. For Tutorials (except for prototype), create a thumbnail in the [index.rst file](https://github.com/pytorch/tutorials/blob/master/index.rst) using a command like `.. customcarditem:: beginner/your_tutorial.html`. For Recipes, create a thumbnail in the [recipes_index.rst](https://github.com/pytorch/tutorials/blob/master/recipes_source/recipes_index.rst) +2. For Tutorials (except if it is a prototype feature), include it in the TOC tree at index.rst +3. For Tutorials (except if it is a prototype feature), create a thumbnail in the [index.rst file](https://github.com/pytorch/tutorials/blob/master/index.rst) using a command like `.. customcarditem:: beginner/your_tutorial.html`. For Recipes, create a thumbnail in the [recipes_index.rst](https://github.com/pytorch/tutorials/blob/master/recipes_source/recipes_index.rst) In case you prefer to write your tutorial in jupyter, you can use [this script](https://gist.github.com/chsasank/7218ca16f8d022e02a9c0deb94a310fe) to convert the notebook to python file. After conversion and addition to the project, please make sure the sections headings etc are in logical order. 
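The README steps above refer to a "notebook styled python file". As a rough sketch (the file name, title, and author below are placeholders, not an actual tutorial in this repository), such a file looks like the following, with sphinx-gallery rendering the docstring and the ``#####``-delimited comment blocks as text and executing the code in between:

.. code-block:: python

   """
   My Feature Tutorial
   ===================
   **Author**: A. Contributor

   A one-paragraph summary of what this tutorial demonstrates.
   """
   import torch

   ######################################################################
   # 1. Setup
   # --------
   # Ordinary ``#`` comments inside these blocks become rendered text.

   torch.manual_seed(0)
   x = torch.randn(4, 4)

   ######################################################################
   # 2. Run the example
   # ------------------

   print(x.sum())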
From 958aa33b7a92df79e094bdd1ddeb2e92ba466184 Mon Sep 17 00:00:00 2001 From: Yang Gu Date: Thu, 9 Jul 2020 15:04:57 +0800 Subject: [PATCH 11/19] refine torchvision_tutorial doc for windows --- intermediate_source/torchvision_tutorial.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/intermediate_source/torchvision_tutorial.rst b/intermediate_source/torchvision_tutorial.rst index 93fcfd3d247..703deeab11c 100644 --- a/intermediate_source/torchvision_tutorial.rst +++ b/intermediate_source/torchvision_tutorial.rst @@ -56,6 +56,11 @@ If your model returns the above methods, they will make it work for both training and evaluation, and will use the evaluation scripts from ``pycocotools``. +.. note :: + For Windows, please install ``pycocotools`` from `gautamchitnis `__ with command + + ``pip install git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI`` + One note on the ``labels``. The model considers class ``0`` as background. If your dataset does not contain the background class, you should not have ``0`` in your ``labels``. For example, assuming you have just two classes, *cat* and *dog*, you can define ``1`` (not ``0``) to represent *cats* and ``2`` to represent *dogs*. So, for instance, if one of the images has booth classes, your ``labels`` tensor should look like ``[1,2]``. Additionally, if you want to use aspect ratio grouping during training From 9b0635d5e61ddda040891097773c667e359bb652 Mon Sep 17 00:00:00 2001 From: Hritik Bhandari Date: Thu, 9 Jul 2020 18:51:40 +0530 Subject: [PATCH 12/19] Update neural_style_tutorial.py (#1059) Updated the mistake in the Loading Images Section. --- advanced_source/neural_style_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/advanced_source/neural_style_tutorial.py b/advanced_source/neural_style_tutorial.py index 5b8e56bdc0c..b3421a11bee 100644 --- a/advanced_source/neural_style_tutorial.py +++ b/advanced_source/neural_style_tutorial.py @@ -83,7 +83,7 @@ # An important detail to note is that neural networks from the # torch library are trained with tensor values ranging from 0 to 1. If you # try to feed the networks with 0 to 255 tensor images, then the activated -# feature maps will be unable sense the intended content and style. +# feature maps will be unable to sense the intended content and style. # However, pre-trained networks from the Caffe library are trained with 0 # to 255 tensor images. # From 3740027fe211cd593a299da3c2d1dd1b91d39ddc Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Thu, 9 Jul 2020 09:23:26 -0400 Subject: [PATCH 13/19] torch_script_custom_ops restructure (#1057) Signed-off-by: Edward Z. Yang --- advanced_source/torch_script_custom_ops.rst | 96 ++++++------------- .../torch_script_custom_ops/CMakeLists.txt | 14 +++ .../torch_script_custom_ops/op.cpp | 35 +++++++ 3 files changed, 80 insertions(+), 65 deletions(-) create mode 100644 advanced_source/torch_script_custom_ops/CMakeLists.txt create mode 100644 advanced_source/torch_script_custom_ops/op.cpp diff --git a/advanced_source/torch_script_custom_ops.rst b/advanced_source/torch_script_custom_ops.rst index 9127855878d..18264c9021c 100644 --- a/advanced_source/torch_script_custom_ops.rst +++ b/advanced_source/torch_script_custom_ops.rst @@ -23,7 +23,7 @@ Python and in their serialized form directly in C++. The following paragraphs give an example of writing a TorchScript custom op to call into `OpenCV `_, a computer vision library written in C++. 
We will discuss how to work with tensors in C++, how to efficiently -convert them to third party tensor formats (in this case, OpenCV ``Mat``s), how +convert them to third party tensor formats (in this case, OpenCV ``Mat``), how to register your operator with the TorchScript runtime and finally how to compile the operator and use it in Python and C++. @@ -37,27 +37,10 @@ TorchScript as a custom operator. The first step is to write the implementation of our custom operator in C++. Let's call the file for this implementation ``op.cpp`` and make it look like this: -.. code-block:: cpp - - #include - #include - - torch::Tensor warp_perspective(torch::Tensor image, torch::Tensor warp) { - cv::Mat image_mat(/*rows=*/image.size(0), - /*cols=*/image.size(1), - /*type=*/CV_32FC1, - /*data=*/image.data()); - cv::Mat warp_mat(/*rows=*/warp.size(0), - /*cols=*/warp.size(1), - /*type=*/CV_32FC1, - /*data=*/warp.data()); - - cv::Mat output_mat; - cv::warpPerspective(image_mat, output_mat, warp_mat, /*dsize=*/{8, 8}); - - torch::Tensor output = torch::from_blob(output_mat.ptr(), /*sizes=*/{8, 8}); - return output.clone(); - } +.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp + :language: cpp + :start-after: BEGIN warp_perspective + :end-before: END warp_perspective The code for this operator is quite short. At the top of the file, we include the OpenCV header file, ``opencv2/opencv.hpp``, alongside the ``torch/script.h`` @@ -92,12 +75,10 @@ tensors to OpenCV matrices, as OpenCV's ``warpPerspective`` expects ``cv::Mat`` objects as inputs. Fortunately, there is a way to do this **without copying any** data. In the first few lines, -.. code-block:: cpp - - cv::Mat image_mat(/*rows=*/image.size(0), - /*cols=*/image.size(1), - /*type=*/CV_32FC1, - /*data=*/image.data()); +.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp + :language: cpp + :start-after: BEGIN image_mat + :end-before: END image_mat we are calling `this constructor `_ @@ -113,12 +94,10 @@ subsequent OpenCV routines with the library's native matrix type, even though we're actually storing the data in a PyTorch tensor. We repeat this procedure to convert the ``warp`` PyTorch tensor to the ``warp_mat`` OpenCV matrix: -.. code-block:: cpp - - cv::Mat warp_mat(/*rows=*/warp.size(0), - /*cols=*/warp.size(1), - /*type=*/CV_32FC1, - /*data=*/warp.data()); +.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp + :language: cpp + :start-after: BEGIN warp_mat + :end-before: END warp_mat Next, we are ready to call the OpenCV function we were so eager to use in TorchScript: ``warpPerspective``. For this, we pass the OpenCV function the @@ -126,10 +105,10 @@ TorchScript: ``warpPerspective``. For this, we pass the OpenCV function the called ``output_mat``. We also specify the size ``dsize`` we want the output matrix (image) to be. It is hardcoded to ``8 x 8`` for this example: -.. code-block:: cpp - - cv::Mat output_mat; - cv::warpPerspective(image_mat, output_mat, warp_mat, /*dsize=*/{8, 8}); +.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp + :language: cpp + :start-after: BEGIN output_mat + :end-before: END output_mat The final step in our custom operator implementation is to convert the ``output_mat`` back into a PyTorch tensor, so that we can further use it in @@ -139,9 +118,10 @@ other direction. In this case, PyTorch provides a ``torch::from_blob`` method. A we want to interpret as a PyTorch tensor. The call to ``torch::from_blob`` looks like this: -.. 
code-block:: cpp - - torch::from_blob(output_mat.ptr(), /*sizes=*/{8, 8}) +.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp + :language: cpp + :start-after: BEGIN output_tensor + :end-before: END output_tensor We use the ``.ptr()`` method on the OpenCV ``Mat`` class to get a raw pointer to the underlying data (just like ``.data()`` for the PyTorch @@ -167,10 +147,10 @@ with the TorchScript runtime and compiler. This will allow the TorchScript compiler to resolve references to our custom operator in TorchScript code. Registration is very simple. For our case, we need to write: -.. code-block:: cpp - - static auto registry = - torch::RegisterOperators("my_ops::warp_perspective", &warp_perspective); +.. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp + :language: cpp + :start-after: BEGIN registry + :end-before: END registry somewhere in the global scope of our ``op.cpp`` file. This creates a global variable ``registry``, which will register our operator with TorchScript in its @@ -230,22 +210,8 @@ somewhere accessible in your file system. The following paragraphs will refer to that location as ``/path/to/libtorch``. The contents of our ``CMakeLists.txt`` file should then be the following: -.. code-block:: cmake - - cmake_minimum_required(VERSION 3.1 FATAL_ERROR) - project(warp_perspective) - - find_package(Torch REQUIRED) - find_package(OpenCV REQUIRED) - - # Define our library target - add_library(warp_perspective SHARED op.cpp) - # Enable C++11 - target_compile_features(warp_perspective PRIVATE cxx_range_for) - # Link against LibTorch - target_link_libraries(warp_perspective "${TORCH_LIBRARIES}") - # Link against OpenCV - target_link_libraries(warp_perspective opencv_core opencv_imgproc) +.. literalinclude:: ../advanced_source/torch_script_custom_ops/CMakeLists.txt + :language: cpp .. warning:: @@ -267,7 +233,7 @@ To now build our operator, we can run the following commands from our $ mkdir build $ cd build - $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. + $ cmake -DCMAKE_PREFIX_PATH=$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)') .. -- The C compiler identification is GNU 5.4.0 -- The CXX compiler identification is GNU 5.4.0 -- Check for working C compiler: /usr/bin/cc @@ -660,7 +626,7 @@ Along with a small ``CMakeLists.txt`` file: At this point, we should be able to build the application: -.. code-block:: cpp +.. code-block:: $ mkdir build $ cd build @@ -700,7 +666,7 @@ At this point, we should be able to build the application: And run it without passing a model just yet: -.. code-block:: cpp +.. code-block:: $ ./example_app usage: example_app @@ -727,7 +693,7 @@ The last line will serialize the script function into a file called "example.pt". If we then pass this serialized model to our C++ application, we can run it straight away: -.. code-block:: cpp +.. 
code-block:: $ ./example_app example.pt terminate called after throwing an instance of 'torch::jit::script::ErrorReport' diff --git a/advanced_source/torch_script_custom_ops/CMakeLists.txt b/advanced_source/torch_script_custom_ops/CMakeLists.txt new file mode 100644 index 00000000000..e116153b941 --- /dev/null +++ b/advanced_source/torch_script_custom_ops/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.1 FATAL_ERROR) +project(warp_perspective) + +find_package(Torch REQUIRED) +find_package(OpenCV REQUIRED) + +# Define our library target +add_library(warp_perspective SHARED op.cpp) +# Enable C++14 +target_compile_features(warp_perspective PRIVATE cxx_std_14) +# Link against LibTorch +target_link_libraries(warp_perspective "${TORCH_LIBRARIES}") +# Link against OpenCV +target_link_libraries(warp_perspective opencv_core opencv_imgproc) diff --git a/advanced_source/torch_script_custom_ops/op.cpp b/advanced_source/torch_script_custom_ops/op.cpp new file mode 100644 index 00000000000..ba74f6da5a5 --- /dev/null +++ b/advanced_source/torch_script_custom_ops/op.cpp @@ -0,0 +1,35 @@ +#include +#include + +// BEGIN warp_perspective +torch::Tensor warp_perspective(torch::Tensor image, torch::Tensor warp) { + // BEGIN image_mat + cv::Mat image_mat(/*rows=*/image.size(0), + /*cols=*/image.size(1), + /*type=*/CV_32FC1, + /*data=*/image.data_ptr()); + // END image_mat + + // BEGIN warp_mat + cv::Mat warp_mat(/*rows=*/warp.size(0), + /*cols=*/warp.size(1), + /*type=*/CV_32FC1, + /*data=*/warp.data_ptr()); + // END warp_mat + + // BEGIN output_mat + cv::Mat output_mat; + cv::warpPerspective(image_mat, output_mat, warp_mat, /*dsize=*/{8, 8}); + // END output_mat + + // BEGIN output_tensor + torch::Tensor output = torch::from_blob(output_mat.ptr(), /*sizes=*/{8, 8}); + return output.clone(); + // END output_tensor +} +// END warp_perspective + +// BEGIN registry +static auto registry = + torch::RegisterOperators("my_ops::warp_perspective", &warp_perspective); +// END registry From 3e32d228c437a32dcfa4f70b09e14448c2832e18 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Thu, 9 Jul 2020 07:17:18 -0700 Subject: [PATCH 14/19] Port custom ops tutorial to new registration API, increase testability. Signed-off-by: Edward Z. Yang --- advanced_source/torch_script_custom_ops.rst | 230 ++++++++---------- .../torch_script_custom_ops/op.cpp | 5 +- .../torch_script_custom_ops/smoke_test.py | 3 + .../torch_script_custom_ops/test.py | 34 +++ 4 files changed, 145 insertions(+), 127 deletions(-) create mode 100644 advanced_source/torch_script_custom_ops/smoke_test.py create mode 100644 advanced_source/torch_script_custom_ops/test.py diff --git a/advanced_source/torch_script_custom_ops.rst b/advanced_source/torch_script_custom_ops.rst index 18264c9021c..cb064bf6747 100644 --- a/advanced_source/torch_script_custom_ops.rst +++ b/advanced_source/torch_script_custom_ops.rst @@ -124,7 +124,7 @@ like this: :end-before: END output_tensor We use the ``.ptr()`` method on the OpenCV ``Mat`` class to get a raw -pointer to the underlying data (just like ``.data()`` for the PyTorch +pointer to the underlying data (just like ``.data_ptr()`` for the PyTorch tensor earlier). We also specify the output shape of the tensor, which we hardcoded as ``8 x 8``. The output of ``torch::from_blob`` is then a ``torch::Tensor``, pointing to the memory owned by the OpenCV matrix. 
@@ -145,40 +145,28 @@ Registering the Custom Operator with TorchScript Now that have implemented our custom operator in C++, we need to *register* it with the TorchScript runtime and compiler. This will allow the TorchScript compiler to resolve references to our custom operator in TorchScript code. -Registration is very simple. For our case, we need to write: +If you have ever used the pybind11 library, our syntax for registration +resembles the pybind11 syntax very closely. To register a single function, +we write: .. literalinclude:: ../advanced_source/torch_script_custom_ops/op.cpp :language: cpp :start-after: BEGIN registry :end-before: END registry -somewhere in the global scope of our ``op.cpp`` file. This creates a global -variable ``registry``, which will register our operator with TorchScript in its -constructor (i.e. exactly once per program). We specify the name of the -operator, and a pointer to its implementation (the function we wrote earlier). -The name consists of two parts: a *namespace* (``my_ops``) and a name for the -particular operator we are registering (``warp_perspective``). The namespace and -operator name are separated by two colons (``::``). +somewhere at the top level of our ``op.cpp`` file. The ``TORCH_LIBRARY`` macro +creates a function that will be called when your program starts. The name +of your library (``my_ops``) is given as the first argument (it should not +be in quotes). The second argument (``m``) defines a variable of type +``torch::Library`` which is the main interface to register your operators. +The method ``Library::def`` actually creates an operator named ``warp_perspective``, +exposing it to both Python and TorchScript. You can define as many operators +as you like by making multiple calls to ``def``. -.. tip:: - - If you want to register more than one operator, you can chain calls to - ``.op()`` after the constructor: - - .. code-block:: cpp - - static auto registry = - torch::RegisterOperators("my_ops::warp_perspective", &warp_perspective) - .op("my_ops::another_op", &another_op) - .op("my_ops::and_another_op", &and_another_op); - -Behind the scenes, ``RegisterOperators`` will perform a number of fairly -complicated C++ template metaprogramming magic tricks to infer the argument and -return value types of the function pointer we pass it (``&warp_perspective``). -This information is used to form a *function schema* for our operator. A -function schema is a structured representation of an operator -- a kind of -"signature" or "prototype" -- used by the TorchScript compiler to verify -correctness in TorchScript programs. +Behinds the scenes, the ``def`` function is actually doing quite a bit of work: +it is using template metaprogramming to inspect the type signature of your +function and translate it into an operator schema which specifies the operators +type within TorchScript's type system. Building the Custom Operator ---------------------------- @@ -189,7 +177,16 @@ we can load into Python for research and experimentation, or into C++ for inference in a no-Python environment. There exist multiple ways to build our operator, using either pure CMake, or Python alternatives like ``setuptools``. For brevity, the paragraphs below only discuss the CMake approach. The appendix -of this tutorial dives into the Python based alternatives. +of this tutorial dives into other alternatives. + +Environment setup +***************** + +We need an installation of PyTorch and OpenCV. 
The easiest and most platform +independent way to get both is to via Conda:: + + conda install -c pytorch pytorch + conda install opencv Building with CMake ******************* @@ -203,29 +200,11 @@ a directory structure that looks like this:: op.cpp CMakeLists.txt -Also, make sure to grab the latest version of the LibTorch distribution, which -packages PyTorch's C++ libraries and CMake build files, from `pytorch.org -`_. Place the unzipped distribution -somewhere accessible in your file system. The following paragraphs will refer to -that location as ``/path/to/libtorch``. The contents of our ``CMakeLists.txt`` -file should then be the following: +The contents of our ``CMakeLists.txt`` file should then be the following: .. literalinclude:: ../advanced_source/torch_script_custom_ops/CMakeLists.txt :language: cpp -.. warning:: - - This setup makes some assumptions about the build environment, particularly - what pertains to the installation of OpenCV. The above ``CMakeLists.txt`` file - was tested inside a Docker container running Ubuntu Xenial with - ``libopencv-dev`` installed via ``apt``. If it does not work for you and you - feel stuck, please use the ``Dockerfile`` in the `accompanying tutorial - repository `_ to - build an isolated, reproducible environment in which to play around with the - code from this tutorial. If you run into further troubles, please file an - issue in the tutorial repository or post a question in `our forum - `_. - To now build our operator, we can run the following commands from our ``warp_perspective`` folder: @@ -268,24 +247,18 @@ To now build our operator, we can run the following commands from our [100%] Built target warp_perspective which will place a ``libwarp_perspective.so`` shared library file in the -``build`` folder. In the ``cmake`` command above, you should replace -``/path/to/libtorch`` with the path to your unzipped LibTorch distribution. +``build`` folder. In the ``cmake`` command above, we use the helper +variable ``torch.utils.cmake_prefix_path`` to conveniently tell us where +the cmake files for our PyTorch install are. We will explore how to use and call our operator in detail further below, but to get an early sensation of success, we can try running the following code in Python: -.. code-block:: python - - >>> import torch - >>> torch.ops.load_library("/path/to/libwarp_perspective.so") - >>> print(torch.ops.my_ops.warp_perspective) - -Here, ``/path/to/libwarp_perspective.so`` should be a relative or absolute path -to the ``libwarp_perspective.so`` shared library we just built. If all goes -well, this should print something like +.. literalinclude:: ../advanced_source/torch_script_custom_ops/smoke_test.py + :language: python -.. code-block:: python +If all goes well, this should print something like:: @@ -302,10 +275,9 @@ TorchScript code. You already saw how to import your operator into Python: ``torch.ops.load_library()``. This function takes the path to a shared library containing custom operators, and loads it into the current process. Loading the -shared library will also execute the constructor of the global -``RegisterOperators`` object we placed into our custom operator implementation -file. This will register our custom operator with the TorchScript compiler and -allow us to use that operator in TorchScript code. +shared library will also execute the ``TORCH_LIBRARY`` block. This will register +our custom operator with the TorchScript compiler and allow us to use that +operator in TorchScript code. 
You can refer to your loaded operator as ``torch.ops..``, where ```` is the namespace part of your operator name, and @@ -316,11 +288,16 @@ While this function can be used in scripted or traced TorchScript modules, we can also just use it in vanilla eager PyTorch and pass it regular PyTorch tensors: +.. literalinclude:: ../advanced_source/torch_script_custom_ops/test.py + :language: python + :prepend: import torch + :start-after: BEGIN preamble + :end-before: END preamble + +producing: + .. code-block:: python - >>> import torch - >>> torch.ops.load_library("libwarp_perspective.so") - >>> torch.ops.my_ops.warp_perspective(torch.randn(32, 32), torch.rand(3, 3)) tensor([[0.0000, 0.3218, 0.4611, ..., 0.4636, 0.4636, 0.4636], [0.3746, 0.0978, 0.5005, ..., 0.4636, 0.4636, 0.4636], [0.3245, 0.0169, 0.0000, ..., 0.4458, 0.4458, 0.4458], @@ -332,24 +309,26 @@ tensors: .. note:: - What happens behind the scenes is that the first time you access - ``torch.ops.namespace.function`` in Python, the TorchScript compiler (in C++ - land) will see if a function ``namespace::function`` has been registered, and - if so, return a Python handle to this function that we can subsequently use to - call into our C++ operator implementation from Python. This is one noteworthy - difference between TorchScript custom operators and C++ extensions: C++ - extensions are bound manually using pybind11, while TorchScript custom ops are - bound on the fly by PyTorch itself. Pybind11 gives you more flexibility with - regards to what types and classes you can bind into Python and is thus - recommended for purely eager code, but it is not supported for TorchScript - ops. + What happens behind the scenes is that the first time you access + ``torch.ops.namespace.function`` in Python, the TorchScript compiler (in C++ + land) will see if a function ``namespace::function`` has been registered, and + if so, return a Python handle to this function that we can subsequently use to + call into our C++ operator implementation from Python. This is one noteworthy + difference between TorchScript custom operators and C++ extensions: C++ + extensions are bound manually using pybind11, while TorchScript custom ops are + bound on the fly by PyTorch itself. Pybind11 gives you more flexibility with + regards to what types and classes you can bind into Python and is thus + recommended for purely eager code, but it is not supported for TorchScript + ops. From here on, you can use your custom operator in scripted or traced code just as you would other functions from the ``torch`` package. In fact, "standard library" functions like ``torch.matmul`` go through largely the same registration path as custom operators, which makes custom operators really first-class citizens when it comes to how and where they can be used in -TorchScript. +TorchScript. (One difference, however, is that standard library functions +have custom written Python argument parsing logic that differs from +``torch.ops`` argument parsing.) Using the Custom Operator with Tracing ************************************** @@ -357,10 +336,10 @@ Using the Custom Operator with Tracing Let's start by embedding our operator in a traced function. Recall that for tracing, we start with some vanilla Pytorch code: -.. code-block:: python - - def compute(x, y, z): - return x.matmul(y) + torch.relu(z) +.. literalinclude:: ../advanced_source/torch_script_custom_ops/test.py + :language: python + :start-after: BEGIN compute + :end-before: END compute and then call ``torch.jit.trace`` on it. 
We further pass ``torch.jit.trace`` some example inputs, which it will forward to our implementation to record the @@ -368,54 +347,54 @@ sequence of operations that occur as the inputs flow through it. The result of this is effectively a "frozen" version of the eager PyTorch program, which the TorchScript compiler can further analyze, optimize and serialize: -.. code-block:: python +.. literalinclude:: ../advanced_source/torch_script_custom_ops/test.py + :language: python + :start-after: BEGIN trace + :end-before: END trace - >>> inputs = [torch.randn(4, 8), torch.randn(8, 5), torch.randn(4, 5)] - >>> trace = torch.jit.trace(compute, inputs) - >>> print(trace.graph) - graph(%x : Float(4, 8) - %y : Float(8, 5) - %z : Float(4, 5)) { - %3 : Float(4, 5) = aten::matmul(%x, %y) - %4 : Float(4, 5) = aten::relu(%z) - %5 : int = prim::Constant[value=1]() - %6 : Float(4, 5) = aten::add(%3, %4, %5) - return (%6); - } +Producing:: + + graph(%x : Float(4:8, 8:1), + %y : Float(8:5, 5:1), + %z : Float(4:5, 5:1)): + %3 : Float(4:5, 5:1) = aten::matmul(%x, %y) # test.py:10:0 + %4 : Float(4:5, 5:1) = aten::relu(%z) # test.py:10:0 + %5 : int = prim::Constant[value=1]() # test.py:10:0 + %6 : Float(4:5, 5:1) = aten::add(%3, %4, %5) # test.py:10:0 + return (%6) Now, the exciting revelation is that we can simply drop our custom operator into our PyTorch trace as if it were ``torch.relu`` or any other ``torch`` function: -.. code-block:: python - - torch.ops.load_library("libwarp_perspective.so") - - def compute(x, y, z): - x = torch.ops.my_ops.warp_perspective(x, torch.eye(3)) - return x.matmul(y) + torch.relu(z) +.. literalinclude:: ../advanced_source/torch_script_custom_ops/test.py + :language: python + :start-after: BEGIN compute2 + :end-before: END compute2 and then trace it as before: -.. code-block:: python - - >>> inputs = [torch.randn(4, 8), torch.randn(8, 5), torch.randn(8, 5)] - >>> trace = torch.jit.trace(compute, inputs) - >>> print(trace.graph) - graph(%x.1 : Float(4, 8) - %y : Float(8, 5) - %z : Float(8, 5)) { - %3 : int = prim::Constant[value=3]() - %4 : int = prim::Constant[value=6]() - %5 : int = prim::Constant[value=0]() - %6 : int[] = prim::Constant[value=[0, -1]]() - %7 : Float(3, 3) = aten::eye(%3, %4, %5, %6) - %x : Float(8, 8) = my_ops::warp_perspective(%x.1, %7) - %11 : Float(8, 5) = aten::matmul(%x, %y) - %12 : Float(8, 5) = aten::relu(%z) - %13 : int = prim::Constant[value=1]() - %14 : Float(8, 5) = aten::add(%11, %12, %13) - return (%14); - } +.. literalinclude:: ../advanced_source/torch_script_custom_ops/test.py + :language: python + :start-after: BEGIN trace2 + :end-before: END trace2 + +Producing:: + + graph(%x.1 : Float(4:8, 8:1), + %y : Float(8:5, 5:1), + %z : Float(8:5, 5:1)): + %3 : int = prim::Constant[value=3]() # test.py:25:0 + %4 : int = prim::Constant[value=6]() # test.py:25:0 + %5 : int = prim::Constant[value=0]() # test.py:25:0 + %6 : Device = prim::Constant[value="cpu"]() # test.py:25:0 + %7 : bool = prim::Constant[value=0]() # test.py:25:0 + %8 : Float(3:3, 3:1) = aten::eye(%3, %4, %5, %6, %7) # test.py:25:0 + %x : Float(8:8, 8:1) = my_ops::warp_perspective(%x.1, %8) # test.py:25:0 + %10 : Float(8:5, 5:1) = aten::matmul(%x, %y) # test.py:26:0 + %11 : Float(8:5, 5:1) = aten::relu(%z) # test.py:26:0 + %12 : int = prim::Constant[value=1]() # test.py:26:0 + %13 : Float(8:5, 5:1) = aten::add(%10, %11, %12) # test.py:26:0 + return (%13) Integrating TorchScript custom ops into traced PyTorch code is as easy as this! @@ -947,8 +926,9 @@ custom TorchScript operator as a string. 
For this, use return output.clone(); } - static auto registry = - torch::RegisterOperators("my_ops::warp_perspective", &warp_perspective); + TORCH_LIBRARY(my_ops, m) { + m.def("warp_perspective", &warp_perspective); + } """ torch.utils.cpp_extension.load_inline( diff --git a/advanced_source/torch_script_custom_ops/op.cpp b/advanced_source/torch_script_custom_ops/op.cpp index ba74f6da5a5..ff2eb049c4c 100644 --- a/advanced_source/torch_script_custom_ops/op.cpp +++ b/advanced_source/torch_script_custom_ops/op.cpp @@ -30,6 +30,7 @@ torch::Tensor warp_perspective(torch::Tensor image, torch::Tensor warp) { // END warp_perspective // BEGIN registry -static auto registry = - torch::RegisterOperators("my_ops::warp_perspective", &warp_perspective); +TORCH_LIBRARY(my_ops, m) { + m.def("warp_perspective", warp_perspective); +} // END registry diff --git a/advanced_source/torch_script_custom_ops/smoke_test.py b/advanced_source/torch_script_custom_ops/smoke_test.py new file mode 100644 index 00000000000..fa629ddcafb --- /dev/null +++ b/advanced_source/torch_script_custom_ops/smoke_test.py @@ -0,0 +1,3 @@ +import torch +torch.ops.load_library("build/libwarp_perspective.so") +print(torch.ops.my_ops.warp_perspective) diff --git a/advanced_source/torch_script_custom_ops/test.py b/advanced_source/torch_script_custom_ops/test.py new file mode 100644 index 00000000000..26f96ef4599 --- /dev/null +++ b/advanced_source/torch_script_custom_ops/test.py @@ -0,0 +1,34 @@ +import torch + + +print("BEGIN preamble") +torch.ops.load_library("build/libwarp_perspective.so") +print(torch.ops.my_ops.warp_perspective(torch.randn(32, 32), torch.rand(3, 3))) +print("END preamble") + + +# BEGIN compute +def compute(x, y, z): + return x.matmul(y) + torch.relu(z) +# END compute + + +print("BEGIN trace") +inputs = [torch.randn(4, 8), torch.randn(8, 5), torch.randn(4, 5)] +trace = torch.jit.trace(compute, inputs) +print(trace.graph) +print("END trace") + + +# BEGIN compute2 +def compute(x, y, z): + x = torch.ops.my_ops.warp_perspective(x, torch.eye(3)) + return x.matmul(y) + torch.relu(z) +# END compute2 + + +print("BEGIN trace2") +inputs = [torch.randn(4, 8), torch.randn(8, 5), torch.randn(8, 5)] +trace = torch.jit.trace(compute, inputs) +print(trace.graph) +print("END trace2") From 999a02977c4830dbeca88f76f461f018d3df210b Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Thu, 9 Jul 2020 07:22:14 -0700 Subject: [PATCH 15/19] Kill some other occurrences of RegisterOperators Signed-off-by: Edward Z. Yang --- advanced_source/torch_script_custom_ops.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/advanced_source/torch_script_custom_ops.rst b/advanced_source/torch_script_custom_ops.rst index cb064bf6747..33c80a80778 100644 --- a/advanced_source/torch_script_custom_ops.rst +++ b/advanced_source/torch_script_custom_ops.rst @@ -484,7 +484,7 @@ function inside of our script code: When the TorchScript compiler sees the reference to ``torch.ops.my_ops.warp_perspective``, it will find the implementation we -registered via the ``RegisterOperators`` object in C++, and compile it into its +registered via the ``TORCH_LIBRARY`` function in C++, and compile it into its graph representation: .. code-block:: python @@ -720,7 +720,7 @@ library. ``-Wl,--no-as-needed`` prefix to the ``warp_perspective`` link line. This is required because we will not actually be calling any function from the ``warp_perspective`` shared library in our application code. 
We only need the - global ``RegisterOperators`` object's constructor to run. Inconveniently, this + ``TORCH_LIBRARY`` function to run. Inconveniently, this confuses the linker and makes it think it can just skip linking against the library altogether. On Linux, the ``-Wl,--no-as-needed`` flag forces the link to happen (NB: this flag is specific to Linux!). There are other workarounds From f90f7737561a69bfe0105bfd861f92c4bab222f2 Mon Sep 17 00:00:00 2001 From: Jessica Lin Date: Thu, 9 Jul 2020 07:40:35 -0700 Subject: [PATCH 16/19] Update README.md --- prototype_source/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prototype_source/README.md b/prototype_source/README.md index 41003507b84..ecb860aea68 100644 --- a/prototype_source/README.md +++ b/prototype_source/README.md @@ -4,4 +4,6 @@ This directory contains tutorials demonstrating prototype features in PyTorch. **Prototype features** are not available as part of binary distributions like PyPI or Conda (except maybe behind run-time flags). To test these features we would, depending on the feature, recommend building from master or using the nightly wheelss that are made available on pytorch.org. +These tutorials are intentionally left out of the pytorch.org/tutorials build and will not show up on the website. + *Level of commitment:* We are committing to gathering high bandwidth feedback only on these features. Based on this feedback and potential further engagement between community members, we as a community will decide if we want to upgrade the level of commitment or to fail fast. From c6059ec98dd7da6bc647e350ce2dd83248ec8e4e Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Thu, 9 Jul 2020 08:14:58 -0700 Subject: [PATCH 17/19] Make torch_script_custom_classes tutorial runnable I also fixed some warnings in the tutorial, and fixed some minor bitrot (e.g., torch::script::Module to torch::jit::Module) I also added some missing quotes around some bash expansions. Signed-off-by: Edward Z. 
Yang --- .gitignore | 4 + .../torch_script_custom_classes.rst | 333 ++---------------- .../CMakeLists.txt | 15 + .../custom_class_project/CMakeLists.txt | 10 + .../custom_class_project/class.cpp | 145 ++++++++ .../custom_class_project/custom_test.py | 48 +++ .../custom_class_project/export_attr.py | 21 ++ .../custom_class_project/save.py | 18 + .../torch_script_custom_classes/infer.cpp | 20 ++ .../torch_script_custom_classes/run.sh | 21 ++ .../torch_script_custom_classes/run2.sh | 13 + advanced_source/torch_script_custom_ops.rst | 6 +- 12 files changed, 353 insertions(+), 301 deletions(-) create mode 100644 advanced_source/torch_script_custom_classes/CMakeLists.txt create mode 100644 advanced_source/torch_script_custom_classes/custom_class_project/CMakeLists.txt create mode 100644 advanced_source/torch_script_custom_classes/custom_class_project/class.cpp create mode 100644 advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py create mode 100644 advanced_source/torch_script_custom_classes/custom_class_project/export_attr.py create mode 100644 advanced_source/torch_script_custom_classes/custom_class_project/save.py create mode 100644 advanced_source/torch_script_custom_classes/infer.cpp create mode 100755 advanced_source/torch_script_custom_classes/run.sh create mode 100755 advanced_source/torch_script_custom_classes/run2.sh diff --git a/.gitignore b/.gitignore index c4a23cc1208..27c61631029 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ intermediate advanced pytorch_basics recipes +prototype #data things _data/ @@ -117,3 +118,6 @@ ENV/ .DS_Store cleanup.sh *.swp + +# PyTorch things +*.pt diff --git a/advanced_source/torch_script_custom_classes.rst b/advanced_source/torch_script_custom_classes.rst index 031e6c3f696..c5d34b13100 100644 --- a/advanced_source/torch_script_custom_classes.rst +++ b/advanced_source/torch_script_custom_classes.rst @@ -2,7 +2,7 @@ Extending TorchScript with Custom C++ Classes =============================================== This tutorial is a follow-on to the -`custom operator `_ +:doc:`custom operator ` tutorial, and introduces the API we've built for binding C++ classes into TorchScript and Python simultaneously. The API is very similar to `pybind11 `_, and most of the concepts will transfer @@ -14,44 +14,10 @@ Implementing and Binding the Class in C++ For this tutorial, we are going to define a simple C++ class that maintains persistent state in a member variable. -.. code-block:: cpp - - // This header is all you need to do the C++ portions of this - // tutorial - #include - // This header is what defines the custom class registration - // behavior specifically. script.h already includes this, but - // we include it here so you know it exists in case you want - // to look at the API or implementation. - #include - - #include - #include - - template - struct MyStackClass : torch::CustomClassHolder { - std::vector stack_; - MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} - - void push(T x) { - stack_.push_back(x); - } - T pop() { - auto val = stack_.back(); - stack_.pop_back(); - return val; - } - - c10::intrusive_ptr clone() const { - return c10::make_intrusive(stack_); - } - - void merge(const c10::intrusive_ptr& c) { - for (auto& elem : c->stack_) { - push(elem); - } - } - }; +.. 
literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp + :language: cpp + :start-after: BEGIN class + :end-before: END class There are several things to note: @@ -69,46 +35,10 @@ There are several things to note: Now let's take a look at how we will make this class visible to TorchScript, a process called *binding* the class: -.. code-block:: cpp - - // Notice a few things: - // - We pass the class to be registered as a template parameter to - // `torch::class_`. In this instance, we've passed the - // specialization of the MyStackClass class ``MyStackClass``. - // In general, you cannot register a non-specialized template - // class. For non-templated classes, you can just pass the - // class name directly as the template parameter. - // - The arguments passed to the constructor make up the "qualified name" - // of the class. In this case, the registered class will appear in - // Python and C++ as `torch.classes.my_classes.MyStackClass`. We call - // the first argument the "namespace" and the second argument the - // actual class name. - static auto testStack = - torch::class_>("my_classes", "MyStackClass") - // The following line registers the contructor of our MyStackClass - // class that takes a single `std::vector` argument, - // i.e. it exposes the C++ method `MyStackClass(std::vector init)`. - // Currently, we do not support registering overloaded - // constructors, so for now you can only `def()` one instance of - // `torch::init`. - .def(torch::init>()) - // The next line registers a stateless (i.e. no captures) C++ lambda - // function as a method. Note that a lambda function must take a - // `c10::intrusive_ptr` (or some const/ref version of that) - // as the first argument. Other arguments can be whatever you want. - .def("top", [](const c10::intrusive_ptr>& self) { - return self->stack_.back(); - }) - // The following four lines expose methods of the MyStackClass - // class as-is. `torch::class_` will automatically examine the - // argument and return types of the passed-in method pointers and - // expose these to Python and TorchScript accordingly. Finally, notice - // that we must take the *address* of the fully-qualified method name, - // i.e. use the unary `&` operator, due to C++ typing rules. - .def("push", &MyStackClass::push) - .def("pop", &MyStackClass::pop) - .def("clone", &MyStackClass::clone) - .def("merge", &MyStackClass::merge); +.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp + :language: cpp + :start-after: BEGIN binding + :end-before: END binding @@ -121,18 +51,8 @@ we've covered so far and place it in a file called ``class.cpp``. Then, write a simple ``CMakeLists.txt`` file and place it in the same directory. Here is what ``CMakeLists.txt`` should look like: -.. code-block:: cmake - - cmake_minimum_required(VERSION 3.1 FATAL_ERROR) - project(custom_class) - - find_package(Torch REQUIRED) - - # Define our library target - add_library(custom_class SHARED class.cpp) - set(CMAKE_CXX_STANDARD 14) - # Link against LibTorch - target_link_libraries(custom_class "${TORCH_LIBRARIES}") +.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/CMakeLists.txt + :language: cmake Also, create a ``build`` directory. Your file tree should look like this:: @@ -141,16 +61,14 @@ Also, create a ``build`` directory. 
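Starting from an empty folder, a minimal sketch of the shell commands that
produce this layout might look as follows (the contents of ``class.cpp`` and
``CMakeLists.txt`` are the listings shown above):

.. code-block:: shell

  mkdir custom_class_project
  cd custom_class_project
  # Create class.cpp and CMakeLists.txt with the contents shown above,
  # then add an (initially empty) build directory for CMake to use.
  mkdir build
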
Your file tree should look like this:: CMakeLists.txt build/ -Now, to build the project, go ahead and download the appropriate libtorch -binary from the `PyTorch website `_. Extract the -zip archive somewhere (within the project directory might be convenient) -and note the path you've extracted it to. Next, go ahead and invoke cmake and -then make to build the project: +We assume you've setup your environment in the same way as described in +the :doc:`previous tutorial `. +Go ahead and invoke cmake and then make to build the project: .. code-block:: shell $ cd build - $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. + $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. -- The C compiler identification is GNU 7.3.1 -- The CXX compiler identification is GNU 7.3.1 -- Check for working C compiler: /opt/rh/devtoolset-7/root/usr/bin/cc @@ -201,52 +119,9 @@ Now that we have our class and its registration compiled into an ``.so`` file, we can load that `.so` into Python and try it out. Here's a script that demonstrates that: -.. code-block:: python +.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py + :language: python - import torch - - # `torch.classes.load_library()` allows you to pass the path to your .so file - # to load it in and make the custom C++ classes available to both Python and - # TorchScript - torch.classes.load_library("libcustom_class.so") - # You can query the loaded libraries like this: - print(torch.classes.loaded_libraries) - # prints {'/custom_class_project/build/libcustom_class.so'} - - # We can find and instantiate our custom C++ class in python by using the - # `torch.classes` namespace: - # - # This instantiation will invoke the MyStackClass(std::vector init) constructor - # we registered earlier - s = torch.classes.my_classes.MyStackClass(["foo", "bar"]) - - # We can call methods in Python - s.push("pushed") - assert s.pop() == "pushed" - - # Returning and passing instances of custom classes works as you'd expect - s2 = s.clone() - s.merge(s2) - for expected in ["bar", "foo", "bar", "foo"]: - assert s.pop() == expected - - # We can also use the class in TorchScript - # For now, we need to assign the class's type to a local in order to - # annotate the type on the TorchScript function. This may change - # in the future. - MyStackClass = torch.classes.my_classes.MyStackClass - - @torch.jit.script - def do_stacks(s : MyStackClass): # We can pass a custom class instance to TorchScript - s2 = torch.classes.my_classes.MyStackClass(["hi", "mom"]) # We can instantiate the class - s2.merge(s) # We can call a method on the class - return s2.clone(), s2.top() # We can also return instances of the class - # from TorchScript function/methods - - stack, top = do_stacks(torch.classes.my_classes.MyStackClass(["wow"])) - assert top == "wow" - for expected in ["wow", "mom", "hi"]: - assert stack.pop() == expected Saving, Loading, and Running TorchScript Code Using Custom Classes ------------------------------------------------------------------ @@ -255,24 +130,8 @@ We can also use custom-registered C++ classes in a C++ process using libtorch. As an example, let's define a simple ``nn.Module`` that instantiates and calls a method on our MyStackClass class: -.. 
code-block:: python - - import torch - - torch.classes.load_library('libcustom_class.so') - - class Foo(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, s : str) -> str: - stack = torch.classes.my_classes.MyStackClass(["hi", "mom"]) - return stack.pop() + s - - scripted_foo = torch.jit.script(Foo()) - print(scripted_foo.graph) - - scripted_foo.save('foo.pt') +.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/save.py + :language: python ``foo.pt`` in our filesystem now contains the serialized TorchScript program we've just defined. @@ -300,55 +159,20 @@ build the custom class into the binary. Let's populate ``infer.cpp`` with the following: -.. code-block:: cpp - - #include - - #include - #include - - int main(int argc, const char* argv[]) { - torch::script::Module module; - try { - // Deserialize the ScriptModule from a file using torch::jit::load(). - module = torch::jit::load("foo.pt"); - } - catch (const c10::Error& e) { - std::cerr << "error loading the model\n"; - return -1; - } - - std::vector inputs = {"foobarbaz"}; - auto output = module.forward(inputs).toString(); - std::cout << output->string() << std::endl; - } +.. literalinclude:: ../advanced_source/torch_script_custom_classes/infer.cpp + :language: cpp And similarly let's define our CMakeLists.txt file: -.. code-block:: cmake - - cmake_minimum_required(VERSION 3.1 FATAL_ERROR) - project(infer) - - find_package(Torch REQUIRED) - - add_subdirectory(custom_class_project) - - # Define our library target - add_executable(infer infer.cpp) - set(CMAKE_CXX_STANDARD 14) - # Link against LibTorch - target_link_libraries(infer "${TORCH_LIBRARIES}") - # This is where we link in our libcustom_class code, making our - # custom class available in our binary. - target_link_libraries(infer -Wl,--no-as-needed custom_class) +.. literalinclude:: ../advanced_source/torch_script_custom_classes/CMakeLists.txt + :language: cpp You know the drill: ``cd build``, ``cmake``, and ``make``: .. code-block:: shell $ cd build - $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. + $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. -- The C compiler identification is GNU 7.3.1 -- The CXX compiler identification is GNU 7.3.1 -- Check for working C compiler: /opt/rh/devtoolset-7/root/usr/bin/cc @@ -412,7 +236,7 @@ or you want to instantiate a custom class attribute in C++. For creating an - In the event that you already have an ``intrusive_ptr`` pointing to your class, you can directly construct an IValue from it using the constructor ``IValue(intrusive_ptr)``. -For converting ``IValue``s back to custom classes: +For converting ``IValue`` back to custom classes: - ``IValue::toCustomClass()`` will return an ``intrusive_ptr`` pointing to the custom class that the ``IValue`` contains. Internally, this function is checking @@ -426,24 +250,8 @@ Defining Serialization/Deserialization Methods for Custom C++ Classes If you try to save a ``ScriptModule`` with a custom-bound C++ class as an attribute, you'll get the following error: -.. code-block:: python - - # export_attr.py - import torch - - torch.classes.load_library('libcustom_class.so') - - class Foo(torch.nn.Module): - def __init__(self): - super().__init__() - self.stack = torch.classes.my_classes.MyStackClass(["just", "testing"]) - - def forward(self, s : str) -> str: - return self.stack.pop() + s - - scripted_foo = torch.jit.script(Foo()) - - scripted_foo.save('foo.pt') +.. 
literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/export_attr.py + :language: python .. code-block:: shell @@ -464,52 +272,10 @@ the special ``def_pickle`` method on ``class_``. Here is an example of how we can update the registration code for our ``MyStackClass`` class to include serialization methods: -.. code-block:: cpp - - static auto testStack = - torch::class_>("my_classes", "MyStackClass") - .def(torch::init>()) - .def("top", [](const c10::intrusive_ptr>& self) { - return self->stack_.back(); - }) - .def("push", &MyStackClass::push) - .def("pop", &MyStackClass::pop) - .def("clone", &MyStackClass::clone) - .def("merge", &MyStackClass::merge) - // class_<>::def_pickle allows you to define the serialization - // and deserialization methods for your C++ class. - // Currently, we only support passing stateless lambda functions - // as arguments to def_pickle - .def_pickle( - // __getstate__ - // This function defines what data structure should be produced - // when we serialize an instance of this class. The function - // must take a single `self` argument, which is an intrusive_ptr - // to the instance of the object. The function can return - // any type that is supported as a return value of the TorchScript - // custom operator API. In this instance, we've chosen to return - // a std::vector as the salient data to preserve - // from the class. - [](const c10::intrusive_ptr>& self) - -> std::vector { - return self->stack_; - }, - // __setstate__ - // This function defines how to create a new instance of the C++ - // class when we are deserializing. The function must take a - // single argument of the same type as the return value of - // `__getstate__`. The function must return an intrusive_ptr - // to a new instance of the C++ class, initialized however - // you would like given the serialized state. - [](std::vector state) - -> c10::intrusive_ptr> { - // A convenient way to instantiate an object and get an - // intrusive_ptr to it is via `make_intrusive`. We use - // that here to allocate an instance of MyStackClass - // and call the single-argument std::vector - // constructor with the serialized state. - return c10::make_intrusive>(std::move(state)); - }); +.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp + :language: cpp + :start-after: BEGIN pickle_binding + :end-before: END pickle_binding .. note:: We take a different approach from pybind11 in the pickle API. Whereas pybind11 @@ -520,27 +286,6 @@ Here is an example of how we can update the registration code for our Once we have defined the (de)serialization behavior in this way, our script can now run successfully: -.. code-block:: python - - import torch - - torch.classes.load_library('libcustom_class.so') - - class Foo(torch.nn.Module): - def __init__(self): - super().__init__() - self.stack = torch.classes.my_classes.MyStackClass(["just", "testing"]) - - def forward(self, s : str) -> str: - return self.stack.pop() + s - - scripted_foo = torch.jit.script(Foo()) - - scripted_foo.save('foo.pt') - loaded = torch.jit.load('foo.pt') - - print(loaded.stack.pop()) - .. code-block:: shell $ python ../export_attr.py @@ -553,18 +298,10 @@ Once you've defined a custom C++ class, you can also use that class as an argument or return from a custom operator (i.e. free functions). Here's an example of how to do that: -.. 
code-block:: cpp - - c10::intrusive_ptr> manipulate_instance(const c10::intrusive_ptr>& instance) { - instance->pop(); - return instance; - } - - static auto instance_registry = torch::RegisterOperators().op( - torch::RegisterOperators::options() - .schema( - "foo::manipulate_instance(__torch__.torch.classes.my_classes.MyStackClass x) -> __torch__.torch.classes.my_classes.MyStackClass Y") - .catchAllKernel()); +.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp + :language: cpp + :start-after: BEGIN free_function + :end-before: END free_function Refer to the `custom op tutorial `_ for more details on the registration API. diff --git a/advanced_source/torch_script_custom_classes/CMakeLists.txt b/advanced_source/torch_script_custom_classes/CMakeLists.txt new file mode 100644 index 00000000000..6a1eb3e87fa --- /dev/null +++ b/advanced_source/torch_script_custom_classes/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.1 FATAL_ERROR) +project(infer) + +find_package(Torch REQUIRED) + +add_subdirectory(custom_class_project) + +# Define our library target +add_executable(infer infer.cpp) +set(CMAKE_CXX_STANDARD 14) +# Link against LibTorch +target_link_libraries(infer "${TORCH_LIBRARIES}") +# This is where we link in our libcustom_class code, making our +# custom class available in our binary. +target_link_libraries(infer -Wl,--no-as-needed custom_class) diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/CMakeLists.txt b/advanced_source/torch_script_custom_classes/custom_class_project/CMakeLists.txt new file mode 100644 index 00000000000..bb3d41aa997 --- /dev/null +++ b/advanced_source/torch_script_custom_classes/custom_class_project/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.1 FATAL_ERROR) +project(custom_class) + +find_package(Torch REQUIRED) + +# Define our library target +add_library(custom_class SHARED class.cpp) +set(CMAKE_CXX_STANDARD 14) +# Link against LibTorch +target_link_libraries(custom_class "${TORCH_LIBRARIES}") diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/class.cpp b/advanced_source/torch_script_custom_classes/custom_class_project/class.cpp new file mode 100644 index 00000000000..dd3480bee75 --- /dev/null +++ b/advanced_source/torch_script_custom_classes/custom_class_project/class.cpp @@ -0,0 +1,145 @@ +// BEGIN class +// This header is all you need to do the C++ portions of this +// tutorial +#include +// This header is what defines the custom class registration +// behavior specifically. script.h already includes this, but +// we include it here so you know it exists in case you want +// to look at the API or implementation. +#include + +#include +#include + +template +struct MyStackClass : torch::CustomClassHolder { + std::vector stack_; + MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} + + void push(T x) { + stack_.push_back(x); + } + T pop() { + auto val = stack_.back(); + stack_.pop_back(); + return val; + } + + c10::intrusive_ptr clone() const { + return c10::make_intrusive(stack_); + } + + void merge(const c10::intrusive_ptr& c) { + for (auto& elem : c->stack_) { + push(elem); + } + } +}; +// END class + +#ifdef NO_PICKLE + +// BEGIN binding +// Notice a few things: +// - We pass the class to be registered as a template parameter to +// `torch::class_`. In this instance, we've passed the +// specialization of the MyStackClass class ``MyStackClass``. 
+// In general, you cannot register a non-specialized template +// class. For non-templated classes, you can just pass the +// class name directly as the template parameter. +// - The arguments passed to the constructor make up the "qualified name" +// of the class. In this case, the registered class will appear in +// Python and C++ as `torch.classes.my_classes.MyStackClass`. We call +// the first argument the "namespace" and the second argument the +// actual class name. +static auto testStack = + torch::class_>("my_classes", "MyStackClass") + // The following line registers the contructor of our MyStackClass + // class that takes a single `std::vector` argument, + // i.e. it exposes the C++ method `MyStackClass(std::vector init)`. + // Currently, we do not support registering overloaded + // constructors, so for now you can only `def()` one instance of + // `torch::init`. + .def(torch::init>()) + // The next line registers a stateless (i.e. no captures) C++ lambda + // function as a method. Note that a lambda function must take a + // `c10::intrusive_ptr` (or some const/ref version of that) + // as the first argument. Other arguments can be whatever you want. + .def("top", [](const c10::intrusive_ptr>& self) { + return self->stack_.back(); + }) + // The following four lines expose methods of the MyStackClass + // class as-is. `torch::class_` will automatically examine the + // argument and return types of the passed-in method pointers and + // expose these to Python and TorchScript accordingly. Finally, notice + // that we must take the *address* of the fully-qualified method name, + // i.e. use the unary `&` operator, due to C++ typing rules. + .def("push", &MyStackClass::push) + .def("pop", &MyStackClass::pop) + .def("clone", &MyStackClass::clone) + .def("merge", &MyStackClass::merge); +// END binding + +#else + +// BEGIN pickle_binding +static auto testStack = + torch::class_>("my_classes", "MyStackClass") + .def(torch::init>()) + .def("top", [](const c10::intrusive_ptr>& self) { + return self->stack_.back(); + }) + .def("push", &MyStackClass::push) + .def("pop", &MyStackClass::pop) + .def("clone", &MyStackClass::clone) + .def("merge", &MyStackClass::merge) + // class_<>::def_pickle allows you to define the serialization + // and deserialization methods for your C++ class. + // Currently, we only support passing stateless lambda functions + // as arguments to def_pickle + .def_pickle( + // __getstate__ + // This function defines what data structure should be produced + // when we serialize an instance of this class. The function + // must take a single `self` argument, which is an intrusive_ptr + // to the instance of the object. The function can return + // any type that is supported as a return value of the TorchScript + // custom operator API. In this instance, we've chosen to return + // a std::vector as the salient data to preserve + // from the class. + [](const c10::intrusive_ptr>& self) + -> std::vector { + return self->stack_; + }, + // __setstate__ + // This function defines how to create a new instance of the C++ + // class when we are deserializing. The function must take a + // single argument of the same type as the return value of + // `__getstate__`. The function must return an intrusive_ptr + // to a new instance of the C++ class, initialized however + // you would like given the serialized state. + [](std::vector state) + -> c10::intrusive_ptr> { + // A convenient way to instantiate an object and get an + // intrusive_ptr to it is via `make_intrusive`. 
We use + // that here to allocate an instance of MyStackClass + // and call the single-argument std::vector + // constructor with the serialized state. + return c10::make_intrusive>(std::move(state)); + }); +// END pickle_binding + +// BEGIN free_function +c10::intrusive_ptr> manipulate_instance(const c10::intrusive_ptr>& instance) { + instance->pop(); + return instance; +} + +static auto instance_registry = torch::RegisterOperators().op( +torch::RegisterOperators::options() + .schema( + "foo::manipulate_instance(__torch__.torch.classes.my_classes.MyStackClass x) -> __torch__.torch.classes.my_classes.MyStackClass Y") + .catchAllKernel()); +// END free_function + +#endif diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py b/advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py new file mode 100644 index 00000000000..ba8448e4545 --- /dev/null +++ b/advanced_source/torch_script_custom_classes/custom_class_project/custom_test.py @@ -0,0 +1,48 @@ +import torch + +# `torch.classes.load_library()` allows you to pass the path to your .so file +# to load it in and make the custom C++ classes available to both Python and +# TorchScript +torch.classes.load_library("build/libcustom_class.so") +# You can query the loaded libraries like this: +print(torch.classes.loaded_libraries) +# prints {'/custom_class_project/build/libcustom_class.so'} + +# We can find and instantiate our custom C++ class in python by using the +# `torch.classes` namespace: +# +# This instantiation will invoke the MyStackClass(std::vector init) +# constructor we registered earlier +s = torch.classes.my_classes.MyStackClass(["foo", "bar"]) + +# We can call methods in Python +s.push("pushed") +assert s.pop() == "pushed" + +# Returning and passing instances of custom classes works as you'd expect +s2 = s.clone() +s.merge(s2) +for expected in ["bar", "foo", "bar", "foo"]: + assert s.pop() == expected + +# We can also use the class in TorchScript +# For now, we need to assign the class's type to a local in order to +# annotate the type on the TorchScript function. This may change +# in the future. 
+MyStackClass = torch.classes.my_classes.MyStackClass + + +@torch.jit.script +def do_stacks(s: MyStackClass): # We can pass a custom class instance + # We can instantiate the class + s2 = torch.classes.my_classes.MyStackClass(["hi", "mom"]) + s2.merge(s) # We can call a method on the class + # We can also return instances of the class + # from TorchScript function/methods + return s2.clone(), s2.top() + + +stack, top = do_stacks(torch.classes.my_classes.MyStackClass(["wow"])) +assert top == "wow" +for expected in ["wow", "mom", "hi"]: + assert stack.pop() == expected diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/export_attr.py b/advanced_source/torch_script_custom_classes/custom_class_project/export_attr.py new file mode 100644 index 00000000000..9999d5c8183 --- /dev/null +++ b/advanced_source/torch_script_custom_classes/custom_class_project/export_attr.py @@ -0,0 +1,21 @@ +# export_attr.py +import torch + +torch.classes.load_library('build/libcustom_class.so') + + +class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.stack = torch.classes.my_classes.MyStackClass(["just", "testing"]) + + def forward(self, s: str) -> str: + return self.stack.pop() + s + + +scripted_foo = torch.jit.script(Foo()) + +scripted_foo.save('foo.pt') +loaded = torch.jit.load('foo.pt') + +print(loaded.stack.pop()) diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/save.py b/advanced_source/torch_script_custom_classes/custom_class_project/save.py new file mode 100644 index 00000000000..8826f95da7c --- /dev/null +++ b/advanced_source/torch_script_custom_classes/custom_class_project/save.py @@ -0,0 +1,18 @@ +import torch + +torch.classes.load_library('build/libcustom_class.so') + + +class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, s: str) -> str: + stack = torch.classes.my_classes.MyStackClass(["hi", "mom"]) + return stack.pop() + s + + +scripted_foo = torch.jit.script(Foo()) +print(scripted_foo.graph) + +scripted_foo.save('foo.pt') diff --git a/advanced_source/torch_script_custom_classes/infer.cpp b/advanced_source/torch_script_custom_classes/infer.cpp new file mode 100644 index 00000000000..1ca5b002383 --- /dev/null +++ b/advanced_source/torch_script_custom_classes/infer.cpp @@ -0,0 +1,20 @@ +#include + +#include +#include + +int main(int argc, const char* argv[]) { + torch::jit::Module module; + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + module = torch::jit::load("foo.pt"); + } + catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + return -1; + } + + std::vector inputs = {"foobarbaz"}; + auto output = module.forward(inputs).toString(); + std::cout << output->string() << std::endl; +} diff --git a/advanced_source/torch_script_custom_classes/run.sh b/advanced_source/torch_script_custom_classes/run.sh new file mode 100755 index 00000000000..52c59581309 --- /dev/null +++ b/advanced_source/torch_script_custom_classes/run.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -ex + +rm -rf build +rm -rf custom_class_project/build + +pushd custom_class_project + mkdir build + (cd build && cmake CXXFLAGS="-DNO_PICKLE" -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..) + (cd build && make) + python custom_test.py + python save.py + ! python export_attr.py +popd + +mkdir build +(cd build && cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..) 
+(cd build && make) +mv custom_class_project/foo.pt build/foo.pt +(cd build && ./infer) diff --git a/advanced_source/torch_script_custom_classes/run2.sh b/advanced_source/torch_script_custom_classes/run2.sh new file mode 100755 index 00000000000..d4ef0101a83 --- /dev/null +++ b/advanced_source/torch_script_custom_classes/run2.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -ex + +rm -rf build +rm -rf custom_class_project/build + +pushd custom_class_project + mkdir build + (cd build && cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..) + (cd build && make) + python export_attr.py +popd diff --git a/advanced_source/torch_script_custom_ops.rst b/advanced_source/torch_script_custom_ops.rst index 33c80a80778..d5620387451 100644 --- a/advanced_source/torch_script_custom_ops.rst +++ b/advanced_source/torch_script_custom_ops.rst @@ -212,7 +212,7 @@ To now build our operator, we can run the following commands from our $ mkdir build $ cd build - $ cmake -DCMAKE_PREFIX_PATH=$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)') .. + $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. -- The C compiler identification is GNU 5.4.0 -- The CXX compiler identification is GNU 5.4.0 -- Check for working C compiler: /usr/bin/cc @@ -609,7 +609,7 @@ At this point, we should be able to build the application: $ mkdir build $ cd build - $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. + $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. -- The C compiler identification is GNU 5.4.0 -- The CXX compiler identification is GNU 5.4.0 -- Check for working C compiler: /usr/bin/cc @@ -752,7 +752,7 @@ library. In the top level ``example_app`` directory: $ mkdir build $ cd build - $ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. + $ cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. -- The C compiler identification is GNU 5.4.0 -- The CXX compiler identification is GNU 5.4.0 -- Check for working C compiler: /usr/bin/cc From 32e5407cfe848e9c50d62c3bfa69487af8b64a72 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Tue, 14 Jul 2020 11:11:03 -0400 Subject: [PATCH 18/19] Update torch_script_custom_classes to use TORCH_LIBRARY (#1062) Signed-off-by: Edward Z. Yang --- .../torch_script_custom_classes.rst | 47 +++-- .../custom_class_project/class.cpp | 165 ++++++++---------- 2 files changed, 106 insertions(+), 106 deletions(-) diff --git a/advanced_source/torch_script_custom_classes.rst b/advanced_source/torch_script_custom_classes.rst index c5d34b13100..be13759cdc6 100644 --- a/advanced_source/torch_script_custom_classes.rst +++ b/advanced_source/torch_script_custom_classes.rst @@ -25,12 +25,14 @@ There are several things to note: with your custom class. - Notice that whenever we are working with instances of the custom class, we do it via instances of ``c10::intrusive_ptr<>``. Think of ``intrusive_ptr`` - as a smart pointer like ``std::shared_ptr``. The reason for using this smart pointer - is to ensure consistent lifetime management of the object instances between languages - (C++, Python and TorchScript). + as a smart pointer like ``std::shared_ptr``, but the reference count is stored + directly in the object, as opposed to a separate metadata block (as is done in + ``std::shared_ptr``. 
``torch::Tensor`` internally uses the same pointer type; + and custom classes have to also use this pointer type so that we can + consistently manage different object types. - The second thing to notice is that the user-defined class must inherit from - ``torch::CustomClassHolder``. This ensures that everything is set up to handle - the lifetime management system previously mentioned. + ``torch::CustomClassHolder``. This ensures that the custom class has space to + store the reference count. Now let's take a look at how we will make this class visible to TorchScript, a process called *binding* the class: @@ -39,6 +41,9 @@ Now let's take a look at how we will make this class visible to TorchScript, a p :language: cpp :start-after: BEGIN binding :end-before: END binding + :append: + ; + } @@ -269,13 +274,13 @@ the special ``def_pickle`` method on ``class_``. `read more `_ about how we use these methods. -Here is an example of how we can update the registration code for our -``MyStackClass`` class to include serialization methods: +Here is an example of the ``def_pickle`` call we can add to the registration of +``MyStackClass`` to include serialization methods: .. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp :language: cpp - :start-after: BEGIN pickle_binding - :end-before: END pickle_binding + :start-after: BEGIN def_pickle + :end-before: END def_pickle .. note:: We take a different approach from pybind11 in the pickle API. Whereas pybind11 @@ -295,14 +300,22 @@ Defining Custom Operators that Take or Return Bound C++ Classes --------------------------------------------------------------- Once you've defined a custom C++ class, you can also use that class -as an argument or return from a custom operator (i.e. free functions). Here's an -example of how to do that: +as an argument or return from a custom operator (i.e. free functions). Suppose +you have the following free function: .. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp :language: cpp :start-after: BEGIN free_function :end-before: END free_function +You can register it running the following code inside your ``TORCH_LIBRARY`` +block: + +.. literalinclude:: ../advanced_source/torch_script_custom_classes/custom_class_project/class.cpp + :language: cpp + :start-after: BEGIN def_free + :end-before: END def_free + Refer to the `custom op tutorial `_ for more details on the registration API. @@ -321,12 +334,12 @@ Once this is done, you can use the op like the following example: .. note:: Registration of an operator that takes a C++ class as an argument requires that - the custom class has already been registered. This is fine if your op is - registered after your class in a single compilation unit, however, if your - class is registered in a separate compilation unit from the op you will need - to enforce that dependency. One way to do this is to wrap the class registration - in a `Meyer's singleton `_, which can be - called from the compilation unit that does the operator registration. + the custom class has already been registered. You can enforce this by + making sure the custom class registration and your free function definitions + are in the same ``TORCH_LIBRARY`` block, and that the custom class + registration comes first. In the future, we may relax this requirement, + so that these can be registered in any order. 
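For reference, here is a condensed sketch of that ordering, eliding most of
the ``.def()`` calls from the full ``class.cpp`` listing included with this
tutorial (the complete file appears below):

.. code-block:: cpp

  TORCH_LIBRARY(my_classes, m) {
    // The custom class is registered first...
    m.class_<MyStackClass<std::string>>("MyStackClass")
        .def(torch::init<std::vector<std::string>>());
    // ...and the free function that takes and returns the class is defined
    // afterwards, in the same TORCH_LIBRARY block.
    m.def(
        "foo::manipulate_instance(__torch__.torch.classes.my_classes.MyStackClass x) -> __torch__.torch.classes.my_classes.MyStackClass Y",
        manipulate_instance);
  }
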
+ Conclusion ---------- diff --git a/advanced_source/torch_script_custom_classes/custom_class_project/class.cpp b/advanced_source/torch_script_custom_classes/custom_class_project/class.cpp index dd3480bee75..c5849cef102 100644 --- a/advanced_source/torch_script_custom_classes/custom_class_project/class.cpp +++ b/advanced_source/torch_script_custom_classes/custom_class_project/class.cpp @@ -37,7 +37,12 @@ struct MyStackClass : torch::CustomClassHolder { }; // END class -#ifdef NO_PICKLE +// BEGIN free_function +c10::intrusive_ptr> manipulate_instance(const c10::intrusive_ptr>& instance) { + instance->pop(); + return instance; +} +// END free_function // BEGIN binding // Notice a few things: @@ -52,94 +57,76 @@ struct MyStackClass : torch::CustomClassHolder { // Python and C++ as `torch.classes.my_classes.MyStackClass`. We call // the first argument the "namespace" and the second argument the // actual class name. -static auto testStack = - torch::class_>("my_classes", "MyStackClass") - // The following line registers the contructor of our MyStackClass - // class that takes a single `std::vector` argument, - // i.e. it exposes the C++ method `MyStackClass(std::vector init)`. - // Currently, we do not support registering overloaded - // constructors, so for now you can only `def()` one instance of - // `torch::init`. - .def(torch::init>()) - // The next line registers a stateless (i.e. no captures) C++ lambda - // function as a method. Note that a lambda function must take a - // `c10::intrusive_ptr` (or some const/ref version of that) - // as the first argument. Other arguments can be whatever you want. - .def("top", [](const c10::intrusive_ptr>& self) { - return self->stack_.back(); - }) - // The following four lines expose methods of the MyStackClass - // class as-is. `torch::class_` will automatically examine the - // argument and return types of the passed-in method pointers and - // expose these to Python and TorchScript accordingly. Finally, notice - // that we must take the *address* of the fully-qualified method name, - // i.e. use the unary `&` operator, due to C++ typing rules. - .def("push", &MyStackClass::push) - .def("pop", &MyStackClass::pop) - .def("clone", &MyStackClass::clone) - .def("merge", &MyStackClass::merge); +TORCH_LIBRARY(my_classes, m) { + m.class_>("MyStackClass") + // The following line registers the contructor of our MyStackClass + // class that takes a single `std::vector` argument, + // i.e. it exposes the C++ method `MyStackClass(std::vector init)`. + // Currently, we do not support registering overloaded + // constructors, so for now you can only `def()` one instance of + // `torch::init`. + .def(torch::init>()) + // The next line registers a stateless (i.e. no captures) C++ lambda + // function as a method. Note that a lambda function must take a + // `c10::intrusive_ptr` (or some const/ref version of that) + // as the first argument. Other arguments can be whatever you want. + .def("top", [](const c10::intrusive_ptr>& self) { + return self->stack_.back(); + }) + // The following four lines expose methods of the MyStackClass + // class as-is. `torch::class_` will automatically examine the + // argument and return types of the passed-in method pointers and + // expose these to Python and TorchScript accordingly. Finally, notice + // that we must take the *address* of the fully-qualified method name, + // i.e. use the unary `&` operator, due to C++ typing rules. 
+ .def("push", &MyStackClass::push) + .def("pop", &MyStackClass::pop) + .def("clone", &MyStackClass::clone) + .def("merge", &MyStackClass::merge) // END binding +#ifndef NO_PICKLE +// BEGIN def_pickle + // class_<>::def_pickle allows you to define the serialization + // and deserialization methods for your C++ class. + // Currently, we only support passing stateless lambda functions + // as arguments to def_pickle + .def_pickle( + // __getstate__ + // This function defines what data structure should be produced + // when we serialize an instance of this class. The function + // must take a single `self` argument, which is an intrusive_ptr + // to the instance of the object. The function can return + // any type that is supported as a return value of the TorchScript + // custom operator API. In this instance, we've chosen to return + // a std::vector as the salient data to preserve + // from the class. + [](const c10::intrusive_ptr>& self) + -> std::vector { + return self->stack_; + }, + // __setstate__ + // This function defines how to create a new instance of the C++ + // class when we are deserializing. The function must take a + // single argument of the same type as the return value of + // `__getstate__`. The function must return an intrusive_ptr + // to a new instance of the C++ class, initialized however + // you would like given the serialized state. + [](std::vector state) + -> c10::intrusive_ptr> { + // A convenient way to instantiate an object and get an + // intrusive_ptr to it is via `make_intrusive`. We use + // that here to allocate an instance of MyStackClass + // and call the single-argument std::vector + // constructor with the serialized state. + return c10::make_intrusive>(std::move(state)); + }); +// END def_pickle +#endif // NO_PICKLE -#else - -// BEGIN pickle_binding -static auto testStack = - torch::class_>("my_classes", "MyStackClass") - .def(torch::init>()) - .def("top", [](const c10::intrusive_ptr>& self) { - return self->stack_.back(); - }) - .def("push", &MyStackClass::push) - .def("pop", &MyStackClass::pop) - .def("clone", &MyStackClass::clone) - .def("merge", &MyStackClass::merge) - // class_<>::def_pickle allows you to define the serialization - // and deserialization methods for your C++ class. - // Currently, we only support passing stateless lambda functions - // as arguments to def_pickle - .def_pickle( - // __getstate__ - // This function defines what data structure should be produced - // when we serialize an instance of this class. The function - // must take a single `self` argument, which is an intrusive_ptr - // to the instance of the object. The function can return - // any type that is supported as a return value of the TorchScript - // custom operator API. In this instance, we've chosen to return - // a std::vector as the salient data to preserve - // from the class. - [](const c10::intrusive_ptr>& self) - -> std::vector { - return self->stack_; - }, - // __setstate__ - // This function defines how to create a new instance of the C++ - // class when we are deserializing. The function must take a - // single argument of the same type as the return value of - // `__getstate__`. The function must return an intrusive_ptr - // to a new instance of the C++ class, initialized however - // you would like given the serialized state. - [](std::vector state) - -> c10::intrusive_ptr> { - // A convenient way to instantiate an object and get an - // intrusive_ptr to it is via `make_intrusive`. 
We use - // that here to allocate an instance of MyStackClass - // and call the single-argument std::vector - // constructor with the serialized state. - return c10::make_intrusive>(std::move(state)); - }); -// END pickle_binding - -// BEGIN free_function -c10::intrusive_ptr> manipulate_instance(const c10::intrusive_ptr>& instance) { - instance->pop(); - return instance; +// BEGIN def_free + m.def( + "foo::manipulate_instance(__torch__.torch.classes.my_classes.MyStackClass x) -> __torch__.torch.classes.my_classes.MyStackClass Y", + manipulate_instance + ); +// END def_free } - -static auto instance_registry = torch::RegisterOperators().op( -torch::RegisterOperators::options() - .schema( - "foo::manipulate_instance(__torch__.torch.classes.my_classes.MyStackClass x) -> __torch__.torch.classes.my_classes.MyStackClass Y") - .catchAllKernel()); -// END free_function - -#endif From d823c9e2b82b936ae0c2151876b906df0a9a5f2d Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Tue, 14 Jul 2020 10:23:40 -0700 Subject: [PATCH 19/19] Add Graph Mode Dynamic Quant tutorial Summary: Tutorial to demonstrate graph mode dynamic quant on BERT model. Currently not directly runnable as it requires to download glue dataset and fine-tuned model --- .../graph_mode_dynamic_bert_tutorial.rst | 559 ++++++++++++++++++ 1 file changed, 559 insertions(+) create mode 100644 prototype_source/graph_mode_dynamic_bert_tutorial.rst diff --git a/prototype_source/graph_mode_dynamic_bert_tutorial.rst b/prototype_source/graph_mode_dynamic_bert_tutorial.rst new file mode 100644 index 00000000000..2a296ccfa6b --- /dev/null +++ b/prototype_source/graph_mode_dynamic_bert_tutorial.rst @@ -0,0 +1,559 @@ +(prototype) Graph Mode Dynamic Quantization on BERT +============================================== + + +**Author**: `Supriya Rao `_ + +Introduction +------------ + +This tutorial introduces the steps to do post training Dynamic Quantization with Graph Mode Quantization. Dynamic quantization converts a float model to a quantized model with static int8 data types for the weights and dynamic quantization for the activations. The activations are quantized dynamically (per batch) to int8 while the weights are statically quantized to int8. Graph Mode Quantization flow operates on the model graph and requires minimal user intervention to quantize the model. To be able to use graph mode, the float model needs to be either traced or scripted first. + +Advantages of graph mode quantization are: + +- In graph mode, we can inspect the code that is executed in forward function (e.g. aten function calls) and quantization is achieved by module and graph manipulations. +- Simple quantization flow, minimal manual steps. +- Unlocks the possibility of doing higher level optimizations like automatic precision selection. + +For additional details on Graph Mode Quantization please refer to the `Graph Mode Static Quantization Tutorial `_. + +tl;dr The Graph Mode Dynamic `Quantization API `_: + +.. code:: python + + import torch + from torch.quantization import per_channel_dynamic_qconfig + from torch.quantization import quantize_dynamic_jit + + ts_model = torch.jit.script(float_model) # or torch.jit.trace(float_model, input) + + quantized = quantize_dynamic_jit(ts_model, {'': per_channel_dynamic_qconfig}) + +1. Quantizing BERT Model +------------------------ + +The installaion steps and details about the model are identical to the steps in the Eager Mode Tutorial. Please refer to the tutorial `here `_ for more details. 
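In short, you will need the HuggingFace ``transformers`` package (used for the
model, tokenizer and GLUE helpers imported below) and a recent PyTorch build;
since graph mode quantization is a prototype feature, a nightly build is
recommended. A rough sketch of the installation (exact versions and commands
may differ from what the Eager Mode tutorial uses) is:

.. code:: shell

  pip install transformers
  # Install a recent PyTorch nightly following the instructions on pytorch.org
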
+ +1.1 Setup +^^^^^^^^^ +Once all the necesessary packages are downloaded and installed we setup the code. We first start with the necessary imports and setup for the model. + +.. code:: python + + from __future__ import absolute_import, division, print_function + + import logging + import numpy as np + import os + import random + import sys + import time + import torch + + from argparse import Namespace + from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, + TensorDataset) + from tqdm import tqdm + from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,) + from transformers import glue_compute_metrics as compute_metrics + from transformers import glue_output_modes as output_modes + from transformers import glue_processors as processors + from transformers import glue_convert_examples_to_features as convert_examples_to_features + from torch.quantization import per_channel_dynamic_qconfig + from torch.quantization import quantize_dynamic_jit + + global_rng = random.Random() + + def ids_tensor(shape, vocab_size, rng=None, name=None): + # Creates a random int32 tensor of the shape within the vocab size + if rng is None: + rng = global_rng + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + return torch.tensor(data=values, dtype=torch.long, device='cpu').view(shape).contiguous() + + # Setup logging + logger = logging.getLogger(__name__) + logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.WARN) + + logging.getLogger("transformers.modeling_utils").setLevel( + logging.WARN) # Reduce logging + + print(torch.__version__) + + torch.set_num_threads(1) + print(torch.__config__.parallel_info()) + +1.2 Download GLUE dataset +^^^^^^^^^^^^^^^^^^^^^^^^^ +Before running MRPC tasks we download the GLUE data by running this script and unpack it to a directory glue_data. + +.. code:: shell + + python download_glue_data.py --data_dir='glue_data' --tasks='MRPC' + +1.3 Set global BERT configurations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +To run this experiment we first need a fine tuned BERT model. We provide the fined-tuned BERT model for MRPC task `here `_. To save time, you can download the model file (~400 MB) directly into your local folder $OUT_DIR. + + +.. code:: python + + configs = Namespace() + + # The output directory for the fine-tuned model, $OUT_DIR. + configs.output_dir = "./MRPC/" + + # The data directory for the MRPC task in the GLUE benchmark, $GLUE_DIR/$TASK_NAME. + configs.data_dir = "./glue_data/MRPC" + + # The model name or path for the pre-trained model. + configs.model_name_or_path = "bert-base-uncased" + # The maximum length of an input sequence + configs.max_seq_length = 128 + + # Prepare GLUE task. + configs.task_name = "MRPC".lower() + configs.processor = processors[configs.task_name]() + configs.output_mode = output_modes[configs.task_name] + configs.label_list = configs.processor.get_labels() + configs.model_type = "bert".lower() + configs.do_lower_case = True + + # Set the device, batch size, topology, and caching flags. + configs.device = "cpu" + configs.per_gpu_eval_batch_size = 8 + configs.n_gpu = 0 + configs.local_rank = -1 + configs.overwrite_cache = False + + # Set random seed for reproducibility. 
+ def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + set_seed(42) + + tokenizer = BertTokenizer.from_pretrained( + configs.output_dir, do_lower_case=configs.do_lower_case) + + model = BertForSequenceClassification.from_pretrained(configs.output_dir, torchscript=True) + model.to(configs.device) + +1.4 Quantizing BERT model with Graph Mode Quantization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1.4.1 Script/Trace the model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The input for graph mode quantization is a TorchScript model, so you'll need to either script or trace the model first. Currently, scripting the BERT model is not supported so we trace the model here. + +We first identify the inputs to be passed to the model. Here, we trace the model with the largest possible input size that will be passed during the evaluation. +We choose a batch size of 8 and sequence lenght of 128 based on the input sizes passed in during the evaluation step below. Using the max possible shape during inference while tracing is a limitation of the huggingface BERT model as mentioned `here `_. + +We trace the model using ``torch.jit.trace``. + +.. code:: python + + input_ids = ids_tensor([8, 128], 2) + token_type_ids = ids_tensor([8, 128], 2) + attention_mask = ids_tensor([8, 128], vocab_size=2) + dummy_input = (input_ids, attention_mask, token_type_ids) + traced_model = torch.jit.trace(model, dummy_input) + +1.4.2 Specify qconfig_dict +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code:: + + qconfig_dict = {'': per_channel_dynamic_qconfig} + +qconfig is a named tuple of the observers for activation and weight. For dynamic quantization we use a dummy activation observer to mimic the dynamic quantization process that happens in the operator during runtime. For the weight tensors we recommend using per-channel quantization which helps improve the final accuracy. +``qconfig_dict`` is a dictionary with names of sub modules as key and qconfig for that module as value, empty key means the qconfig will be applied to whole model unless it’s overwritten by more specific configurations, the qconfig for each module is either found in the dictionary or fallback to the qconfig of parent module. + +Right now qconfig_dict is the only way to configure how the model is quantized, and it is done in the granularity of module, that is, we only support one type of qconfig for each module, and the qconfig for sub module will override the qconfig for parent module. For example, if we have + +.. code:: + + qconfig = { + '' : qconfig_global, + 'sub' : qconfig_sub, + 'sub.fc1' : qconfig_fc, + 'sub.fc2': None + } + +Module ``sub.fc1`` will be configured with ``qconfig_fc``, and all other child modules in ``sub`` will be configured with ``qconfig_sub`` and ``sub.fc2`` will not be quantized. All other modules in the model will be quantized with qconfig_global + +.. code:: python + + qconfig_dict = {'': per_channel_dynamic_qconfig} + +1.4.3 Quantize the model (one-line API) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We call the one line API (similar to eager mode) to perform quantization as follows. + +.. code:: python + + quantized_model = quantize_dynamic_jit(traced_model, qconfig_dict) + +2. Evaluation +------------- + +We reuse the tokenize and evaluation function from Huggingface. + +.. 
+2. Evaluation
+-------------
+
+We reuse the tokenization and evaluation functions from Huggingface.
+
+.. code:: python
+
+    def evaluate(args, model, tokenizer, prefix=""):
+        # Loop to handle MNLI double evaluation (matched, mis-matched)
+        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
+        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
+
+        results = {}
+        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
+            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
+
+            if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+                os.makedirs(eval_output_dir)
+
+            args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+            # Note that DistributedSampler samples randomly
+            eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+            eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+            # multi-gpu eval
+            if args.n_gpu > 1:
+                model = torch.nn.DataParallel(model)
+
+            # Eval!
+            logger.info("***** Running evaluation {} *****".format(prefix))
+            logger.info("  Num examples = %d", len(eval_dataset))
+            logger.info("  Batch size = %d", args.eval_batch_size)
+            nb_eval_steps = 0
+            preds = None
+            out_label_ids = None
+            for batch in tqdm(eval_dataloader, desc="Evaluating"):
+                model.eval()
+                batch = tuple(t.to(args.device) for t in batch)
+
+                with torch.no_grad():
+                    inputs = {'input_ids': batch[0],
+                              'attention_mask': batch[1]}
+                    labels = batch[3]
+                    if args.model_type != 'distilbert':
+                        # XLM, DistilBERT and RoBERTa don't use segment_ids
+                        inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None
+                    outputs = model(**inputs)
+                    logits = outputs[0]
+                nb_eval_steps += 1
+                if preds is None:
+                    preds = logits.detach().cpu().numpy()
+                    out_label_ids = labels.detach().cpu().numpy()
+                else:
+                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+                    out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)
+
+            if args.output_mode == "classification":
+                preds = np.argmax(preds, axis=1)
+            elif args.output_mode == "regression":
+                preds = np.squeeze(preds)
+            result = compute_metrics(eval_task, preds, out_label_ids)
+            results.update(result)
+
+            output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results {} *****".format(prefix))
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                    writer.write("%s = %s\n" % (key, str(result[key])))
+
+        return results
+
+    def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+        if args.local_rank not in [-1, 0] and not evaluate:
+            torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
+
+        processor = processors[task]()
+        output_mode = output_modes[task]
+        # Load data features from cache or dataset file
+        cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+            'dev' if evaluate else 'train',
+            list(filter(None, args.model_name_or_path.split('/'))).pop(),
+            str(args.max_seq_length),
+            str(task)))
+        if os.path.exists(cached_features_file) and not args.overwrite_cache:
+            logger.info("Loading features from cached file %s", cached_features_file)
+            features = torch.load(cached_features_file)
+        else:
+            logger.info("Creating features from dataset file at %s", args.data_dir)
+            label_list = processor.get_labels()
+            if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
+                # HACK(label indices are swapped in RoBERTa pretrained model)
+                label_list[1], label_list[2] = label_list[2], label_list[1]
+            examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+            features = convert_examples_to_features(examples,
+                                                    tokenizer,
+                                                    label_list=label_list,
+                                                    max_length=args.max_seq_length,
+                                                    output_mode=output_mode,)
+            if args.local_rank in [-1, 0]:
+                logger.info("Saving features into cached file %s", cached_features_file)
+                torch.save(features, cached_features_file)
+
+        if args.local_rank == 0 and not evaluate:
+            torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
+
+        # Convert to Tensors and build dataset
+        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        if output_mode == "classification":
+            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+        elif output_mode == "regression":
+            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+
+        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
+        return dataset
+
+    def time_model_evaluation(model, configs, tokenizer):
+        eval_start_time = time.time()
+        result = evaluate(configs, model, tokenizer, prefix="")
+        eval_end_time = time.time()
+        eval_duration_time = eval_end_time - eval_start_time
+        print(result)
+        print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time))
+
+2.1 Check Model Size
+^^^^^^^^^^^^^^^^^^^^
+
+We print the model size to quantify the savings from quantization.
+
+.. code:: python
+
+    def print_size_of_model(model):
+        if isinstance(model, torch.jit.RecursiveScriptModule):
+            torch.jit.save(model, "temp.p")
+        else:
+            torch.jit.save(torch.jit.script(model), "temp.p")
+        print('Size (MB):', os.path.getsize("temp.p") / 1e6)
+        os.remove('temp.p')
+
+    print("Size of model before quantization")
+    print_size_of_model(traced_model)
+
+    print("Size of model after quantization")
+    print_size_of_model(quantized_model)
+
+.. code::
+
+    Size of model before quantization
+    Size (MB): 438.242141
+    Size of model after quantization
+    Size (MB): 184.354759
+
+2.2 Run the evaluation
+^^^^^^^^^^^^^^^^^^^^^^
+We evaluate the FP32 and the quantized model and compare the F1 scores. Note that the performance numbers below were measured on a development machine and would likely improve on a production server.
+
+.. code:: python
+
+    time_model_evaluation(traced_model, configs, tokenizer)
+    time_model_evaluation(quantized_model, configs, tokenizer)
+
+.. code::
+
+    FP32 model results -
+    'f1': 0.901
+    Time taken - 188.0s
+
+    INT8 model results -
+    'f1': 0.902
+    Time taken - 157.4s
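+The traced and quantized models are regular TorchScript modules, so they can be saved with ``torch.jit.save`` and reloaded later; the benchmarking section below loads TorchScript models from disk in exactly this way. A minimal sketch, with hypothetical file names:
+
+.. code:: python
+
+    # Hypothetical output paths; adjust them to your environment.
+    float_model_path = os.path.join(configs.output_dir, "bert_traced_float.pt")
+    quantized_model_path = os.path.join(configs.output_dir, "bert_graph_mode_quantized.pt")
+
+    torch.jit.save(traced_model, float_model_path)
+    torch.jit.save(quantized_model, quantized_model_path)
+
+    # Reload one of the artifacts to verify that it round-trips.
+    reloaded_quantized_model = torch.jit.load(quantized_model_path)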
+3. Debugging the Quantized Model
+--------------------------------
+
+We can debug the quantized model by passing in the debug option.
+
+.. code::
+
+    quantized_model = quantize_dynamic_jit(traced_model, qconfig_dict, debug=True)
+
+If debug is set to True:
+
+- We can access the attributes of the quantized model the same way as in a TorchScript model, e.g. ``model.fc1.weight`` (this might be harder if you use a module list or sequential).
+- All arithmetic operations occur in floating point, with numerics matching the final quantized model, which allows for debugging.
+
+.. code:: python
+
+    quantized_model_debug = quantize_dynamic_jit(traced_model, qconfig_dict, debug=True)
+
+Calling ``quantize_dynamic_jit`` is equivalent to calling ``prepare_dynamic_jit`` followed by ``convert_dynamic_jit``. Using the one-line API is recommended, but if you wish to debug or analyze the model after each step, the multi-line API is useful (a sketch of the two-step flow is shown at the end of section 1.4.3).
+
+3.1. Evaluate the Debug Model
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    # Evaluate the debug model
+    time_model_evaluation(quantized_model_debug, configs, tokenizer)
+
+.. code::
+
+    Size (MB): 438.406429
+
+    INT8 (debug=True) model results -
+    'f1': 0.897
+
+Note that the accuracy of the debug version is close to, but not exactly the same as, the non-debug version: the debug version uses floating-point ops to emulate quantized ops, so the numerics match is approximate.
+This is the case only for per-channel quantization (we are working on improving this). Per-tensor quantization (using ``default_dynamic_qconfig``) gives an exact numerics match between the debug and non-debug versions.
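+For reference, switching to per-tensor dynamic quantization is a one-line change to the qconfig dictionary; a minimal sketch, using ``default_dynamic_qconfig`` from ``torch.quantization``:
+
+.. code:: python
+
+    from torch.quantization import default_dynamic_qconfig
+
+    # Per-tensor dynamic quantization for every module; with this qconfig the
+    # debug and non-debug versions should match exactly.
+    per_tensor_qconfig_dict = {'': default_dynamic_qconfig}
+    quantized_model_per_tensor = quantize_dynamic_jit(traced_model, per_tensor_qconfig_dict)
+
+The rest of this tutorial continues with the per-channel configuration.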
+We can print the graph of the debug model to inspect the inserted quantization operations.
+
+.. code:: python
+
+    print(str(quantized_model_debug.graph))
+
+A snippet of the printed graph -
+
+.. code::
+
+    %111 : Tensor = prim::GetAttr[name="bias"](%110)
+    %112 : Tensor = prim::GetAttr[name="weight"](%110)
+    %113 : Float(768:1) = prim::GetAttr[name="4_scale_0"](%110)
+    %114 : Int(768:1) = prim::GetAttr[name="4_zero_point_0"](%110)
+    %115 : int = prim::GetAttr[name="4_axis_0"](%110)
+    %116 : int = prim::GetAttr[name="4_scalar_type_0"](%110)
+    %4.quant.6 : Tensor = aten::quantize_per_channel(%112, %113, %114, %115, %116)
+    %4.dequant.6 : Tensor = aten::dequantize(%4.quant.6)
+    %1640 : bool = prim::Constant[value=1]()
+    %input.5.scale.1 : float, %input.5.zero_point.1 : int = aten::_choose_qparams_per_tensor(%input.5, %1640)
+    %input.5.quant.1 : Tensor = aten::quantize_per_tensor(%input.5, %input.5.scale.1, %input.5.zero_point.1, %74)
+    %input.5.dequant.1 : Float(8:98304, 128:768, 768:1) = aten::dequantize(%input.5.quant.1)
+    %119 : Tensor = aten::linear(%input.5.dequant.1, %4.dequant.6, %111)
+
+We can see that there is no ``quantized::linear_dynamic`` in the model; instead we find the numerically equivalent pattern ``aten::_choose_qparams_per_tensor`` - ``aten::quantize_per_tensor`` - ``aten::dequantize`` - ``aten::linear``.
+
+.. code:: python
+
+    # Get the size of the debug model
+    print_size_of_model(quantized_model_debug)
+
+.. code::
+
+    Size (MB): 438.406429
+
+The size of the debug model is close to that of the floating-point model because all the weights are still in float and not yet quantized and frozen; this lets you inspect the weights.
+You may access the weight attributes directly; accessing a weight in the debug model works the same way as in any TorchScript model:
+
+.. code:: python
+
+    print(quantized_model.bert.encoder.layer._c.getattr('0').attention.self.query.weight)
+
+.. code::
+
+    tensor([[-0.0157,  0.0257, -0.0269,  ...,  0.0158,  0.0764,  0.0548],
+            [-0.0325,  0.0345, -0.0423,  ..., -0.0528,  0.1382,  0.0069],
+            [ 0.0106,  0.0335,  0.0113,  ..., -0.0275,  0.0253, -0.0457],
+            ...,
+            [-0.0090,  0.0512,  0.0555,  ...,  0.0277,  0.0543, -0.0539],
+            [-0.0195,  0.0943,  0.0619,  ..., -0.1040,  0.0598,  0.0465],
+            [ 0.0009, -0.0949,  0.0097,  ..., -0.0183, -0.0511, -0.0085]],
+           grad_fn=)
+
+Accessing the scale and zero_point for the corresponding weight can be done as follows -
+
+.. code:: python
+
+    print(quantized_model.bert.encoder.layer._c.getattr('0').attention.self.query.getattr('4_scale_0'))
+    print(quantized_model.bert.encoder.layer._c.getattr('0').attention.self.query.getattr('4_zero_point_0'))
+
+Since we use per-channel quantization, we get a per-channel scale tensor.
+
+.. code::
+
+    tensor([0.0009, 0.0011, 0.0010, 0.0011, 0.0034, 0.0013, 0.0010, 0.0010, 0.0013,
+            0.0012, 0.0011, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0009, 0.0015,
+            0.0016, 0.0036, 0.0012, 0.0009, 0.0010, 0.0014, 0.0008, 0.0008, 0.0008,
+            ...,
+            0.0019, 0.0023, 0.0013, 0.0018, 0.0012, 0.0031, 0.0015, 0.0013, 0.0014,
+            0.0022, 0.0011, 0.0024])
+
+Zero-point tensor -
+
+.. code::
+
+    tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            ...,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+           dtype=torch.int32)
+
+4. Comparing Results with Eager Mode
+------------------------------------
+
+The following results show the F1 score and model size for Eager Mode Quantization of the same model, obtained by following the steps in the `tutorial `_. The results show that Eager and Graph Mode Quantization produce identical results on this model.
+
+.. code::
+
+    FP32 model results -
+    Size (MB): 438.016605
+    'f1': 0.901
+
+    INT8 model results -
+    Size (MB): 182.878029
+    'f1': 0.902
+
+5. Benchmarking the Model
+-------------------------
+
+We benchmark the models with dummy input and compare the float model with the Eager and Graph Mode quantized models on a production server machine.
+
+.. code:: python
+
+    def benchmark(model):
+        # `model` is a path to a serialized TorchScript model
+        model = torch.jit.load(model)
+        model.eval()
+        torch.set_num_threads(1)
+        input_ids = ids_tensor([8, 128], 2)
+        token_type_ids = ids_tensor([8, 128], 2)
+        attention_mask = ids_tensor([8, 128], vocab_size=2)
+        elapsed = 0
+        for _i in range(50):
+            start = time.time()
+            output = model(input_ids, attention_mask, token_type_ids)
+            end = time.time()
+            elapsed = elapsed + (end - start)
+        print('Elapsed time: ', (elapsed / 50), ' s')
+        return
+
+    print("Running benchmark for Float model")
+    benchmark(args.jit_model_path_float)
+    print("Running benchmark for Eager Mode Quantized model")
+    benchmark(args.jit_model_path_eager)
+    print("Running benchmark for Graph Mode Quantized model")
+    benchmark(args.jit_model_path_graph)
+
+.. code::
+
+    Running benchmark for Float model
+    Elapsed time: 4.49 s
+    Running benchmark for Eager Mode Quantized model
+    Elapsed time: 2.67 s
+    Running benchmark for Graph Mode Quantized model
+    Elapsed time: 2.69 s
+
+As we can see, both the graph mode and the eager mode quantized models achieve a similar speedup over the floating-point model.
+
+Conclusion
+----------
+
+In this tutorial, we demonstrated how to convert a well-known, state-of-the-art NLP model such as BERT into a dynamically quantized model using graph mode, with the same performance as eager mode.
+Dynamic quantization can reduce the size of the model while having only a limited impact on accuracy.
+
+Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any.