pytorch · jlin27 · Dec 6, 2019 · Dec 5, 2019 · Dec 6, 2019
diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.py b/intermediate_source/dynamic_quantization_bert_tutorial.py
@@ -35,22 +35,20 @@
 #    are quantized dynamically (per batch) to int8 when the weights are
 #    quantized to int8.
 #
-# In PyTorch, we have ``torch.quantization.quantize_dynamic`` API support
-# (https://pytorch.org/docs/stable/quantization.html#torch.quantization.quantize_dynamic),
-# which replaces specified modules with dynamic weight-only quantized
+# In PyTorch, we have `torch.quantization.quantize_dynamic API
+# <https://pytorch.org/docs/stable/quantization.html#torch.quantization.quantize_dynamic>`_,
+# which replaces specified modules with dynamic weight-only quantized
 # versions and output the quantized model.
 #
 # -  We demonstrate the accuracy and inference performance results on the
-#    Microsoft Research Paraphrase Corpus (MRPC) task
-#    (https://www.microsoft.com/en-us/download/details.aspx?id=52398) in
-#    the General Language Understanding Evaluation benchmark (GLUE)
-#    (https://gluebenchmark.com/). The MRPC (Dolan and Brockett, 2005) is
+#    `Microsoft Research Paraphrase Corpus (MRPC) task <https://www.microsoft.com/en-us/download/details.aspx?id=52398>`_
+#    in the General Language Understanding Evaluation benchmark `(GLUE)
+#    <https://gluebenchmark.com/>`_. The MRPC (Dolan and Brockett, 2005) is
 #    a corpus of sentence pairs automatically extracted from online news
 #    sources, with human annotations of whether the sentences in the pair
 #    are semantically equivalent. Because the classes are imbalanced (68%
 #    positive, 32% negative), we follow common practice and report both
-#    accuracy and F1 score
-#    (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html).
+#    accuracy and `F1 score <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html>`_.
 #    MRPC is a common NLP task for language pair classification, as shown
 #    below.
 #
@@ -78,8 +76,10 @@
 #
 # To start this tutorial, let’s first follow the installation instructions
 # in PyTorch and HuggingFace Github Repo: -
-# https://github.com/pytorch/pytorch/#installation -
-# https://github.com/huggingface/transformers#installation
+#
+# * https://github.com/pytorch/pytorch/#installation
+#
+# * https://github.com/huggingface/transformers#installation
 #
 # In addition, we also install ``sklearn`` package, as we will reuse its
 # built-in F1 score calculation helper function.
@@ -93,8 +93,8 @@
 ######################################################################
 # Because we will be using the experimental parts of the PyTorch, it is
 # recommended to install the latest version of torch and torchvision. You
-# can find the most recent instructions on local installation here
-# https://pytorch.org/get-started/locally/. For example, to install on
+# can find the most recent instructions on local installation `here
+# <https://pytorch.org/get-started/locally/>`_. For example, to install on
 # Mac:
 #
 # .. code:: shell
@@ -149,10 +149,10 @@
 # Download the dataset
 # --------------------
 #
-# Before running MRPC tasks we download the GLUE data
-# (https://gluebenchmark.com/tasks) by running this script
-# (https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e,
-# https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py)
+# Before running MRPC tasks we download the `GLUE data
+# <https://gluebenchmark.com/tasks>`_ by running this `script
+# <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_ (also available as
+# `download_glue_data <https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py>`_)
 # and unpack it to some directory “glue_data/MRPC”.
 #
 
@@ -176,8 +176,7 @@
 # Convert the texts into features
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
-# glue_convert_examples_to_features (
-# https://github.com/huggingface/transformers/blob/master/transformers/data/processors/glue.py)
+# `glue_convert_examples_to_features <https://github.com/huggingface/transformers/blob/master/transformers/data/processors/glue.py>`_
 # load a data file into a list of ``InputFeatures``.
 #
 # -  Tokenize the input sequences;
@@ -190,8 +189,7 @@
 # F1 metric
 # ~~~~~~~~~
 #
-# The F1 score
-# (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
+# The `F1 score <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html>`_
 # can be interpreted as a weighted average of the precision and recall,
 # where an F1 score reaches its best value at 1 and worst score at 0. The
 # relative contribution of precision and recall to the F1 score are equal.
@@ -217,7 +215,7 @@
 #
 # To fine-tune the pre-trained BERT model (“bert-base-uncased” model in
 # HuggingFace transformers) for the MRPC task, you can follow the command
-# in (https://github.com/huggingface/transformers/tree/master/examples):
+# in `examples <https://github.com/huggingface/transformers/tree/master/examples>`_:
 #
 # ::
 #
@@ -333,10 +331,8 @@ def set_seed(seed):
 # Define the tokenize and evaluation function
 # -------------------------------------------
 #
-# We reuse the tokenize and evaluation function from
-# https://github.com/huggingface/transformers/blob/master/examples/run_glue.py.
+# We reuse the tokenize and evaluation function from `huggingface <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`_.
 #
-
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
@@ -598,8 +594,8 @@ def time_model_evaluation(model, configs, tokenizer):
 # set multi-thread by ``torch.set_num_threads(N)`` (``N`` is the number of
 # intra-op parallelization threads). One preliminary requirement to enable
 # the intra-op parallelization support is to build PyTorch with the right
-# backend such as OpenMP, Native, or TBB
-# (https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#build-options).
+# `backend <https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#build-options>`_
+# such as OpenMP, Native or TBB.
 # You can use ``torch.__config__.parallel_info()`` to check the
 # parallelization settings. On the same MacBook Pro using PyTorch with
 # Native backend for parallelization, we can get about 46 seconds for

diff --git a/intermediate_source/quantized_transfer_learning_tutorial.py b/intermediate_source/quantized_transfer_learning_tutorial.py
@@ -84,9 +84,11 @@
 
 
 ######################################################################
-# Load Data (section not needed as it is covered in the original tutorial)
+# Load Data
 # ------------------------------------------------------------------------
 #
+# .. note:: This section is identical to the original transfer learning tutorial.
+#
 # We will use ``torchvision`` and ``torch.utils.data`` packages to load
 # the data.
 #
@@ -360,7 +362,7 @@ def visualize_model(model, rows=3, cols=3):
 # **Notice that when isolating the feature extractor from a quantized
 # model, you have to place the quantizer in the beginning and in the end
 # of it.**
-#
+# We write a helper function to create a model with a custom head.
 
 from torch import nn
 
@@ -394,8 +396,6 @@ def create_combined_model(model_fe):
   )
   return new_model
 
-new_model = create_combined_model(model_fe)
-
 
 ######################################################################
 # .. warning:: Currently the quantized models can only be run on CPU.
@@ -404,6 +404,7 @@ def create_combined_model(model_fe):
 #
 
 import torch.optim as optim
+new_model = create_combined_model(model_fe)
 new_model = new_model.to('cpu')
 
 criterion = nn.CrossEntropyLoss()
@@ -431,7 +432,7 @@ def create_combined_model(model_fe):
 
 
 ######################################################################
-# **Part 2. Finetuning the quantizable model**
+# Part 2. Finetuning the quantizable model
 #
 # In this part, we fine tune the feature extractor used for transfer
 # learning, and quantize the feature extractor. Note that in both part 1
@@ -446,18 +447,21 @@ def create_combined_model(model_fe):
 # datasets.
 #
 # The pretrained feature extractor must be quantizable, i.e we need to do
-# the following: 1. Fuse (Conv, BN, ReLU), (Conv, BN) and (Conv, ReLU)
-# using torch.quantization.fuse_modules. 2. Connect the feature extractor
-# with a custom head. This requires dequantizing the output of the feature
-# extractor. 3. Insert fake-quantization modules at appropriate locations
-# in the feature extractor to mimic quantization during training.
+# the following:
+#
+#  1. Fuse (Conv, BN, ReLU), (Conv, BN) and (Conv, ReLU)
+#     using torch.quantization.fuse_modules.
+#  2. Connect the feature extractor with a custom head. This requires
+#     dequantizing the output of the feature extractor.
+#  3. Insert fake-quantization modules at appropriate locations
+#     in the feature extractor to mimic quantization during training.
 #
 # For step (1), we use models from torchvision/models/quantization, which
 # support a member method fuse_model, which fuses all the conv, bn, and
 # relu modules. In general, this would require calling the
 # torch.quantization.fuse_modules API with the list of modules to fuse.
 #
-# Step (2) is done by the function create_custom_model function that we
+# Step (2) is done by the ``create_combined_model`` function that we
 # used in the previous section.
 #
 # Step (3) is achieved by using torch.quantization.prepare_qat, which
@@ -534,4 +538,3 @@ def create_combined_model(model_fe):
 plt.ioff()
 plt.tight_layout()
 plt.show()
-