From 1ceb5e6915d265018003f115118ecbc68a59a920 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 17 Apr 2023 16:09:04 -0700 Subject: [PATCH 1/3] Pyspelling: Python intermediate tutorials A-M --- .pyspelling.yml | 15 ++++ en-wordlist.txt | 81 +++++++++++++++++++ .../autograd_saved_tensors_hooks_tutorial.py | 28 ++++--- .../ax_multiobjective_nas_tutorial.py | 46 +++++------ .../char_rnn_classification_tutorial.py | 10 +-- .../char_rnn_generation_tutorial.py | 6 +- .../custom_function_conv_bn_tutorial.py | 58 ++++++------- intermediate_source/ensembling.py | 30 +++---- .../flask_rest_api_tutorial.py | 4 +- intermediate_source/forward_ad_usage.py | 28 +++---- intermediate_source/fx_conv_bn_fuser.py | 8 +- intermediate_source/fx_profiling_tutorial.py | 10 +-- intermediate_source/jacobians_hessians.py | 58 ++++++------- intermediate_source/mario_rl_tutorial.py | 21 +++-- intermediate_source/memory_format_tutorial.py | 46 +++++------ intermediate_source/mnist_train_nas.py | 2 +- .../model_parallel_tutorial.py | 6 +- 17 files changed, 276 insertions(+), 181 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index 015ac975b7f..9dce7c8215a 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -3,6 +3,21 @@ matrix: - name: python sources: - beginner_source/*.py + - intermediate_source/autograd_saved_tensors_hooks_tutorial.py + - intermediate_source/ax_multiobjective_nas_tutorial.py + - intermediate_source/char_rnn_classification_tutorial.py + - intermediate_source/char_rnn_generation_tutorial.py + - intermediate_source/custom_function_conv_bn_tutorial.py + - intermediate_source/ensembling.py + #- intermediate_source/flask_rest_api_tutorial.py + - intermediate_source/forward_ad_usage.py + - intermediate_source/fx_conv_bn_fuser.py + - intermediate_source/fx_profiling_tutorial.py + - intermediate_source/jacobians_hessians.py + - intermediate_source/mario_rl_tutorial.py + - intermediate_source/mnist_train_nas.py + - intermediate_source/memory_format_tutorial.py + - intermediate_source/model_parallel_tutorial.py dictionary: wordlists: - en-wordlist.txt diff --git a/en-wordlist.txt b/en-wordlist.txt index c1447668122..7fd34f1ee56 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -2,6 +2,7 @@ APIs Args Autograd BCE +BN BOS Bahdanau BatchNorm @@ -10,18 +11,26 @@ CIFAR CLS CNNDM CNNs +CPUs CUDA Chatbots Colab Conv ConvNet +ConvNets DCGAN DCGANs +DDQN +DNN DataLoaders DeiT +DenseNet EOS +FC FGSM FLAVA +FX +FX's FloydHub FloydHub's GAN @@ -29,57 +38,85 @@ GANs GPUs GRU GRUs +GeForce Goodfellow Goodfellow’s GreedySearchDecoder +HVP Hugging Face IMDB ImageNet Initializations Iteratively JSON +JVP +Jacobian +Kiuk +Kubernetes Kuei LSTM +LSTMs LeNet LeakyReLU LeakyReLUs +Lua Luong MLP +MLPs MNIST Mypy +NAS +NCHW +NES NLP NaN NeurIPS NumPy Numericalization Numpy's +OpenAI +Plotly +Prec Profiler PyTorch's RGB +RL RNN RNNs +RTX Radford ReLU +ResNet SST2 +Sequentials Sigmoid SoTA +TPU TensorBoard TextVQA Tokenization TorchMultimodal TorchScript +TorchX +Tunable Unescape VQA Wikitext +Xeon accuracies activations adversarially al +autodiff +autograd backend +backends backprop +backpropagate backpropagated backpropagates backpropagation +batchnorm batchnorm's benchmarking boolean @@ -89,12 +126,15 @@ chatbot's checkpointing composable concat +config contrastive conv convolutional cpu csv +cuDNN datafile +dataframe dataloader dataloaders datapipes @@ -105,26 +145,43 @@ deserialize deserialized dir downsample +downsamples embeddings encodings +ensembling eq et evaluateInput +extensibility fastai 
fbgemm feedforward finetune finetuning +fp +functorch +fuser +grayscale +hardcode helpdesk helpdesks +hessian +hessians +hvp hyperparameter hyperparameters imagenet +initializations +inlined +interpretable io iterable iteratively +jacobian +jacobians jit jpg +kwargs labelled learnable loadFilename @@ -139,6 +196,7 @@ modularity modularized multimodal multimodality +multiobjective multithreaded namespace natively @@ -153,26 +211,37 @@ overfitting parallelizable parallelization perceptibility +pipelining +pointwise +precomputing prepend preprocess preprocessing +prespecified pretrained prewritten +primals profiler profilers pytorch quantized quantizing +queryable randint readably reinitializes relu reproducibility rescale +resnet +restride rewinded +romanized +runnable runtime runtime runtimes +scalable softmax src stacktrace @@ -180,29 +249,41 @@ stateful storages strided subclasses +subclassing subdirectories submodule +subreddit summarization tanh th thresholding +timestep +timesteps tokenization tokenize tokenizer torchaudio torchdata +torchscriptable torchtext torchtext's torchvision +torchviz traceback tradeoff +tradeoffs uncomment uncommented +unfused unimodal unnormalized unpickling +updation utils +vectorization +vectorize vectorized +vhp voc walkthrough warmstart diff --git a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py index f549301626d..f16b170ee6a 100644 --- a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py +++ b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py @@ -1,6 +1,6 @@ """ Hooks for autograd saved tensors -======================= +================================ """ @@ -13,8 +13,7 @@ # packing/unpacking process. # # This tutorial assumes you are familiar with how backpropagation works in -# theory. If not, read this first: -# https://colab.research.google.com/drive/1aWNdmYt7RcHMbUk-Xz2Cv5-cGFSWPXe0#scrollTo=AHcEJ6nXUb7W +# theory. If not, read `this `_ first. # @@ -107,7 +106,7 @@ def f(x): ###################################################################### # In the example above, executing without grad would only have kept ``x`` -# and ``y`` in the scope, But the graph additionnally stores ``f(x)`` and +# and ``y`` in the scope, But the graph additionally stores ``f(x)`` and # ``f(f(x))``. Hence, running a forward pass during training will be more # costly in memory usage than during evaluation (more precisely, when # autograd is not required). @@ -182,7 +181,7 @@ def unpack_hook(x): ###################################################################### -# The ``pack_hook`` function will be called everytime an operation saves +# The ``pack_hook`` function will be called every time an operation saves # a tensor for backward. # The output of ``pack_hook`` is then stored in the computation graph # instead of the original tensor. 
@@ -218,8 +217,9 @@ def unpack_hook(x):
 #
 
 ######################################################################
-# **Returning and int**
-
+# Returning an ``int``
+# ^^^^^^^^^^^^^^^^^^^^
+#
 # Returning the index of a Python list
 # Relatively harmless but with debatable usefulness
 
@@ -240,8 +240,9 @@ def unpack(x):
     assert(x.grad.equal(2 * x))
 
 ######################################################################
-# **Returning a tuple**
-
+# Returning a tuple
+# ^^^^^^^^^^^^^^^^^
+#
 # Returning some tensor and a function how to unpack it
 # Quite unlikely to be useful in its current form
 
@@ -262,9 +263,10 @@ def unpack(packed):
     assert(torch.allclose(x.grad, 2 * x))
 
 ######################################################################
-# **Returning a str**
-
-# Returning the __repr__ of the tensor
+# Returning a ``str``
+# ^^^^^^^^^^^^^^^^^^^
+#
+# Returning the ``__repr__`` of the tensor
 # Probably never do this
 
 x = torch.randn(5, requires_grad=True)
@@ -337,7 +339,7 @@ def forward(self, x):
 
 
 ######################################################################
-# In practice, on a A100 GPU, for a resnet-152 with batch size 256, this
+# In practice, on an A100 GPU, for a ResNet-152 with batch size 256, this
 # corresponds to a GPU memory usage reduction from 48GB to 5GB, at the
 # cost of a 6x slowdown.
 #
diff --git a/intermediate_source/ax_multiobjective_nas_tutorial.py b/intermediate_source/ax_multiobjective_nas_tutorial.py
index 7c43f59473c..79b096b9e64 100644
--- a/intermediate_source/ax_multiobjective_nas_tutorial.py
+++ b/intermediate_source/ax_multiobjective_nas_tutorial.py
@@ -48,7 +48,7 @@
 # Our goal is to optimize the PyTorch Lightning training job defined in
 # `mnist_train_nas.py `__.
 # To do this using TorchX, we write a helper function that takes in
-# the values of the architcture and hyperparameters of the training
+# the values of the architecture and hyperparameters of the training
 # job and creates a `TorchX AppDef `__
 # with the appropriate settings.
 #
@@ -72,12 +72,12 @@ def trainer(
     trial_idx: int = -1,
 ) -> specs.AppDef:
 
-    # define the log path so we can pass it to the TorchX AppDef
+    # define the log path so we can pass it to the TorchX ``AppDef``
     if trial_idx >= 0:
         log_path = Path(log_path).joinpath(str(trial_idx)).absolute().as_posix()
 
     return utils.python(
-        # command line args to the training script
+        # command line arguments to the training script
         "--log_path",
         log_path,
         "--hidden_size_1",
@@ -126,15 +126,15 @@ def trainer(
     tracker_base="/tmp/",
     component=trainer,
     # NOTE: To launch this job on a cluster instead of locally you can
-    # specify a different scheduler and adjust args appropriately.
+    # specify a different scheduler and adjust arguments appropriately.
     scheduler="local_cwd",
     component_const_params={"log_path": log_dir},
     cfg={},
 )
 
 ######################################################################
-# Setting up the SearchSpace
-# --------------------------
+# Setting up the ``SearchSpace``
+# ------------------------------
 #
 # First, we define our search space. Ax supports both range parameters
 # of type integer and float as well as choice parameters which can have
@@ -154,7 +154,7 @@ def trainer(
 parameters = [
     # NOTE: In a real-world setting, hidden_size_1 and hidden_size_2
     # should probably be powers of 2, but in our simple example this
-    # would mean that num_params can't take on that many values, which
+    # would mean that ``num_params`` can't take on that many values, which
     # in turn makes the Pareto frontier look pretty weird.
RangeParameter( name="hidden_size_1", @@ -189,7 +189,7 @@ def trainer( upper=0.5, parameter_type=ParameterType.FLOAT, ), - ChoiceParameter( # NOTE: ChoiceParameters don't require log-scale + ChoiceParameter( # NOTE: ``ChoiceParameters`` don't require log-scale name="batch_size", values=[32, 64, 128, 256], parameter_type=ParameterType.INT, @@ -212,7 +212,7 @@ def trainer( # # Ax has the concept of a `Metric `__ # that defines properties of outcomes and how observations are obtained -# for these outcomes. This allows e.g. encodig how data is fetched from +# for these outcomes. This allows e.g. encoding how data is fetched from # some distributed execution backend and post-processed before being # passed as input to Ax. # @@ -229,7 +229,7 @@ def trainer( # index (see the ``trainer()`` function above). We will define a metric # class that is aware of that logging directory. By subclassing # `TensorboardCurveMetric `__ -# we get the logic to read and parse the Tensorboard logs for free. +# we get the logic to read and parse the TensorBoard logs for free. # from ax.metrics.tensorboard import TensorboardCurveMetric @@ -237,10 +237,10 @@ def trainer( class MyTensorboardMetric(TensorboardCurveMetric): - # NOTE: We need to tell the new Tensorboard metric how to get the id / - # file handle for the tensorboard logs from a trial. In this case + # NOTE: We need to tell the new TensorBoard metric how to get the id / + # file handle for the TensorBoard logs from a trial. In this case # our convention is to just save a separate file per trial in - # the pre-specified log dir. + # the prespecified log dir. @classmethod def get_ids_from_trials(cls, trials): return { @@ -257,9 +257,9 @@ def is_available_while_running(cls): ###################################################################### -# Now we can instatiate the metrics for accuracy and the number of +# Now we can instantiate the metrics for accuracy and the number of # model parameters. Here `curve_name` is the name of the metric in the -# Tensorboard logs, while `name` is the metric name used internally +# TensorBoard logs, while `name` is the metric name used internally # by Ax. We also specify `lower_is_better` to indicate the favorable # direction of the two metrics. # @@ -277,8 +277,8 @@ def is_available_while_running(cls): ###################################################################### -# Setting up the OptimizationConfig -# ---------------------------------- +# Setting up the ``OptimizationConfig`` +# ------------------------------------- # # The way to tell Ax what it should optimize is by means of an # `OptimizationConfig `__. @@ -335,8 +335,8 @@ def is_available_while_running(cls): ) ###################################################################### -# Choosing the GenerationStrategy -# ------------------------------- +# Choosing the Generation Strategy +# -------------------------------- # # A `GenerationStrategy `__ # is the abstract representation of how we would like to perform the @@ -366,7 +366,7 @@ def is_available_while_running(cls): # Configuring the Scheduler # ------------------------- # -# The `Scheduler` (TODO: link) acts as the loop control for the optimization. +# The ``Scheduler`` acts as the loop control for the optimization. # It communicates with the backend to launch trials, check their status, # and retrieve results. In the case of this tutorial, it is simply reading # and parsing the locally saved logs. 
In a remote execution setting, @@ -404,7 +404,7 @@ def is_available_while_running(cls): # ------------------------ # # Now that everything is configured, we can let Ax run the optimization -# in a fully automated fashion. The Scheduler will periodially check +# in a fully automated fashion. The Scheduler will periodically check # the logs for the status of all currently running trials, and if a # trial completes the scheduler will update its status on the # experiment and fetch the observations needed for the Bayesian @@ -479,7 +479,7 @@ def is_available_while_running(cls): from ax.plot.diagnostic import interact_cross_validation_plotly from ax.utils.notebook.plotting import init_notebook_plotting, render -cv = cross_validate(model=gs.model) # The surrogate model is stored on the GenerationStrategy +cv = cross_validate(model=gs.model) # The surrogate model is stored on the ``GenerationStrategy`` compute_diagnostics(cv) interact_cross_validation_plotly(cv) @@ -508,7 +508,7 @@ def is_available_while_running(cls): ###################################################################### -# Acknowledgements +# Acknowledgments # ---------------- # # We thank the TorchX team (in particular Kiuk Chung and Tristan Rice) diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py index 78cbc111151..f36b92fb17e 100644 --- a/intermediate_source/char_rnn_classification_tutorial.py +++ b/intermediate_source/char_rnn_classification_tutorial.py @@ -61,7 +61,7 @@ and extract it to the current directory. Included in the ``data/names`` directory are 18 text files named as -"[Language].txt". Each file contains a bunch of names, one name per +``[Language].txt``. Each file contains a bunch of names, one name per line, mostly romanized (but we still need to convert from Unicode to ASCII). @@ -179,7 +179,7 @@ def lineToTensor(line): # tutorial `__) # is just 2 linear layers which operate on an input and hidden state, with -# a LogSoftmax layer after the output. +# a ``LogSoftmax`` layer after the output. # # .. figure:: https://i.imgur.com/Z2xbySO.png # :alt: @@ -230,7 +230,7 @@ def initHidden(self): # For the sake of efficiency we don't want to be creating a new Tensor for # every step, so we will use ``lineToTensor`` instead of # ``letterToTensor`` and use slices. This could be further optimized by -# pre-computing batches of Tensors. +# precomputing batches of Tensors. # input = lineToTensor('Albert') @@ -372,7 +372,7 @@ def timeSince(since): output, loss = train(category_tensor, line_tensor) current_loss += loss - # Print iter number, loss, name and guess + # Print ``iter`` number, loss, name and guess if iter % print_every == 0: guess, guess_i = categoryFromOutput(output) correct = '✓' if guess == category else '✗ (%s)' % category @@ -495,7 +495,7 @@ def predict(input_line, n_predictions=3): # - ``model.py`` (defines the RNN) # - ``train.py`` (runs training) # - ``predict.py`` (runs ``predict()`` with command line arguments) -# - ``server.py`` (serve prediction as a JSON API with bottle.py) +# - ``server.py`` (serve prediction as a JSON API with ``bottle.py``) # # Run ``train.py`` to train and save the network. 
# diff --git a/intermediate_source/char_rnn_generation_tutorial.py b/intermediate_source/char_rnn_generation_tutorial.py index ee7b0d14fd3..431c2bf43d9 100644 --- a/intermediate_source/char_rnn_generation_tutorial.py +++ b/intermediate_source/char_rnn_generation_tutorial.py @@ -234,7 +234,7 @@ def inputTensor(line): tensor[li][0][all_letters.find(letter)] = 1 return tensor -# LongTensor of second letter to end (EOS) for target +# ``LongTensor`` of second letter to end (EOS) for target def targetTensor(line): letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))] letter_indexes.append(n_letters - 1) # EOS @@ -322,7 +322,7 @@ def timeSince(since): print_every = 5000 plot_every = 500 all_losses = [] -total_loss = 0 # Reset every plot_every iters +total_loss = 0 # Reset every ``plot_every`` ``iters`` start = time.time() @@ -429,6 +429,6 @@ def samples(category, start_letters='ABC'): # choosing a start letter # - Get better results with a bigger and/or better shaped network # -# - Try the nn.LSTM and nn.GRU layers +# - Try the ``nn.LSTM`` and ``nn.GRU`` layers # - Combine multiple of these RNNs as a higher level network # diff --git a/intermediate_source/custom_function_conv_bn_tutorial.py b/intermediate_source/custom_function_conv_bn_tutorial.py index 8838fc0d32a..a9fcd8838ae 100644 --- a/intermediate_source/custom_function_conv_bn_tutorial.py +++ b/intermediate_source/custom_function_conv_bn_tutorial.py @@ -35,7 +35,7 @@ For simplicity, in this tutorial we hardcode `bias=False`, `stride=1`, `padding=0`, `dilation=1`, and `groups=1` for Conv2D. For BatchNorm2D, we hardcode `eps=1e-3`, `momentum=0.1`, `affine=False`, and `track_running_statistics=False`. Another small difference -is that we add epsilon in the denomator outside of the square root in the computation +is that we add epsilon in the denominator outside of the square root in the computation of batch norm. [0] https://nenadmarkus.com/p/fusing-batchnorm-and-conv/ @@ -72,7 +72,7 @@ def backward(ctx, grad_out): return convolution_backward(grad_out, X, weight) ###################################################################### -# When testing with gradcheck, it is important to use double precision +# When testing with ``gradcheck``, it is important to use double precision weight = torch.rand(5, 3, 3, 3, requires_grad=True, dtype=torch.double) X = torch.rand(10, 3, 7, 7, requires_grad=True, dtype=torch.double) torch.autograd.gradcheck(Conv2D.apply, (X, weight)) @@ -80,38 +80,38 @@ def backward(ctx, grad_out): ###################################################################### # Backward Formula Implementation for Batch Norm # ------------------------------------------------------------------- -# Batch Norm has two modes: training and eval mode. In training mode -# the sample statistics are a function of the inputs. In eval mode, +# Batch Norm has two modes: training and ``eval`` mode. In training mode +# the sample statistics are a function of the inputs. In ``eval`` mode, # we use the saved running statistics, which are not a function of the inputs. # This makes non-training mode's backward significantly simpler. Below # we implement and test only the training mode case. 
def unsqueeze_all(t): - # Helper function to unsqueeze all the dimensions that we reduce over + # Helper function to ``unsqueeze`` all the dimensions that we reduce over return t[None, :, None, None] def batch_norm_backward(grad_out, X, sum, sqrt_var, N, eps): - # We use the formula: out = (X - mean(X)) / (sqrt(var(X)) + eps) - # in batch norm 2d's forward. To simplify our derivation, we follow the + # We use the formula: ``out = (X - mean(X)) / (sqrt(var(X)) + eps)`` + # in batch norm 2D forward. To simplify our derivation, we follow the # chain rule and compute the gradients as follows before accumulating # them all into a final grad_input. - # 1) 'grad of out wrt var(X)' * 'grad of var(X) wrt X' - # 2) 'grad of out wrt mean(X)' * 'grad of mean(X) wrt X' - # 3) 'grad of out wrt X in the numerator' * 'grad of X wrt X' + # 1) ``grad of out wrt var(X)`` * ``grad of var(X) wrt X`` + # 2) ``grad of out wrt mean(X)`` * ``grad of mean(X) wrt X`` + # 3) ``grad of out wrt X in the numerator`` * ``grad of X wrt X`` # We then rewrite the formulas to use as few extra buffers as possible tmp = ((X - unsqueeze_all(sum) / N) * grad_out).sum(dim=(0, 2, 3)) tmp *= -1 - d_denom = tmp / (sqrt_var + eps)**2 # d_denom = -num / denom**2 - # It is useful to delete tensors when you no longer need them with `del` - # For example, we could've done `del tmp` here because we won't use it later - # In this case, it's not a big difference because tmp only has size of (C,) + d_denom = tmp / (sqrt_var + eps)**2 # ``d_denom = -num / denom**2`` + # It is useful to delete tensors when you no longer need them with ``del`` + # For example, we could've done ``del tmp`` here because we won't use it later + # In this case, it's not a big difference because ``tmp`` only has size of (C,) # The important thing is avoid allocating NCHW-sized tensors unnecessarily - d_var = d_denom / (2 * sqrt_var) # denom = torch.sqrt(var) + eps - # Compute d_mean_dx before allocating the final NCHW-sized grad_input buffer + d_var = d_denom / (2 * sqrt_var) # ``denom = torch.sqrt(var) + eps`` + # Compute ``d_mean_dx`` before allocating the final NCHW-sized grad_input buffer d_mean_dx = grad_out / unsqueeze_all(sqrt_var + eps) d_mean_dx = unsqueeze_all(-d_mean_dx.sum(dim=(0, 2, 3)) / N) - # d_mean_dx has already been reassigned to a C-sized buffer so no need to worry + # ``d_mean_dx`` has already been reassigned to a C-sized buffer so no need to worry - # (1) unbiased_var(x) = ((X - unsqueeze_all(mean))**2).sum(dim=(0, 2, 3)) / (N - 1) + # ``(1) unbiased_var(x) = ((X - unsqueeze_all(mean))**2).sum(dim=(0, 2, 3)) / (N - 1)`` grad_input = X * unsqueeze_all(d_var * N) grad_input += unsqueeze_all(-d_var * sum) grad_input *= 2 / ((N - 1) * N) @@ -120,13 +120,13 @@ def batch_norm_backward(grad_out, X, sum, sqrt_var, N, eps): # (3) Add 'grad_out / ' without allocating an extra buffer grad_input *= unsqueeze_all(sqrt_var + eps) grad_input += grad_out - grad_input /= unsqueeze_all(sqrt_var + eps) # sqrt_var + eps > 0! 
+ grad_input /= unsqueeze_all(sqrt_var + eps) # ``sqrt_var + eps > 0!`` return grad_input class BatchNorm(torch.autograd.Function): @staticmethod def forward(ctx, X, eps=1e-3): - # Don't save keepdim'd values for backward + # Don't save ``keepdim`` values for backward sum = X.sum(dim=(0, 2, 3)) var = X.var(unbiased=True, dim=(0, 2, 3)) N = X.numel() / X.size(1) @@ -149,7 +149,7 @@ def backward(ctx, grad_out): return batch_norm_backward(grad_out, X, ctx.sum, ctx.sqrt_var, ctx.N, ctx.eps) ###################################################################### -# Testing with gradcheck +# Testing with ``gradcheck`` a = torch.rand(1, 2, 3, 4, requires_grad=True, dtype=torch.double) torch.autograd.gradcheck(BatchNorm.apply, (a,), fast_mode=False) @@ -228,7 +228,7 @@ def reset_parameters(self) -> None: nn.init.kaiming_uniform_(self.conv_weight, a=math.sqrt(5)) ###################################################################### -# Use gradcheck to validate the correctness of our backward formula +# Use ``gradcheck`` to validate the correctness of our backward formula weight = torch.rand(5, 3, 3, 3, requires_grad=True, dtype=torch.double) X = torch.rand(2, 3, 4, 4, requires_grad=True, dtype=torch.double) torch.autograd.gradcheck(FusedConvBN2DFunction.apply, (X, weight)) @@ -236,7 +236,7 @@ def reset_parameters(self) -> None: ###################################################################### # Testing out our new Layer # ------------------------------------------------------------------- -# Use FusedConvBN to train a basic network +# Use ``FusedConvBN`` to train a basic network # The code below is after some light modifications to the example here: # https://github.com/pytorch/examples/tree/master/mnist import torch.optim as optim @@ -350,20 +350,20 @@ def test(model, device, test_loader): ###################################################################### # A Comparison of Memory Usage # ------------------------------------------------------------------- -# If cuda is enabled, print out memory usage for both `fused=True` and `fused=False` -# For an example run on RTX 3070, CuDNN 8.0.5: fused peak memory: 1.56GB, +# If CUDA is enabled, print out memory usage for both `fused=True` and `fused=False` +# For an example run on NVIDIA GeForce RTX 3070, NVIDIA CUDA® Deep Neural Network library (cuDNN) 8.0.5: fused peak memory: 1.56GB, # unfused peak memory: 2.68GB # # It is important to note that the *peak* memory usage for this model may vary depending -# the specific CuDNN convolution algorithm used. For shallower models, it +# the specific cuDNN convolution algorithm used. For shallower models, it # may be possible for the peak memory allocated of the fused model to exceed # that of the unfused model! This is because the memory allocated to compute -# certain CuDNN convolution algorithms can be high enough to "hide" the typical peak +# certain cuDNN convolution algorithms can be high enough to "hide" the typical peak # you would expect to be near the start of the backward pass. # # For this reason, we also record and display the memory allocated at the end # of the forward pass as an approximation, and to demonstrate that we indeed -# allocate one fewer buffer per fused conv-bn pair. +# allocate one fewer buffer per fused ``conv-bn`` pair. 
from statistics import mean torch.backends.cudnn.enabled = True @@ -384,7 +384,7 @@ def test(model, device, test_loader): scheduler.step() peak_memory_allocated.append(torch.cuda.max_memory_allocated()) torch.cuda.reset_peak_memory_stats() - print("CuDNN version:", torch.backends.cudnn.version()) + print("cuDNN version:", torch.backends.cudnn.version()) print() print("Peak memory allocated:") print(f"fused: {peak_memory_allocated[0]/1024**3:.2f}GB, unfused: {peak_memory_allocated[1]/1024**3:.2f}GB") diff --git a/intermediate_source/ensembling.py b/intermediate_source/ensembling.py index 8b3c21e4086..8102b7bc184 100644 --- a/intermediate_source/ensembling.py +++ b/intermediate_source/ensembling.py @@ -77,15 +77,15 @@ def forward(self, x): predictions2 = [model(minibatch) for model in models] ###################################################################### -# Using vmap to vectorize the ensemble +# Using ``vmap`` to vectorize the ensemble # ------------------------------------ # -# Let's use vmap to speed up the for-loop. We must first prepare the models -# for use with vmap. +# Let's use ``vmap`` to speed up the for-loop. We must first prepare the models +# for use with ``vmap``. # # First, let’s combine the states of the model together by stacking each # parameter. For example, ``model[i].fc1.weight`` has shape ``[784, 128]``; we are -# going to stack the .fc1.weight of each of the 10 models to produce a big +# going to stack the ``.fc1.weight`` of each of the 10 models to produce a big # weight of shape ``[10, 784, 128]``. # # PyTorch offers the ``torch.func.stack_module_state`` convenience function to do @@ -95,7 +95,7 @@ def forward(self, x): params, buffers = stack_module_state(models) ###################################################################### -# Next, we need to define a function to vmap over. The function should, +# Next, we need to define a function to ``vmap`` over. The function should, # given parameters and buffers and inputs, run the model using those # parameters, buffers, and inputs. We'll use ``torch.func.functional_call`` # to help out: @@ -114,9 +114,9 @@ def fmodel(params, buffers, x): ###################################################################### # Option 1: get predictions using a different minibatch for each model. # -# By default, vmap maps a function across the first dimension of all inputs to +# By default, ``vmap`` maps a function across the first dimension of all inputs to # the passed-in function. After using ``stack_module_state``, each of -# the params and buffers have an additional dimension of size 'num_models' at +# the ``params`` and buffers have an additional dimension of size 'num_models' at # the front, and minibatches has a dimension of size 'num_models'. print([p.size(0) for p in params.values()]) # show the leading 'num_models' dimension @@ -127,14 +127,14 @@ def fmodel(params, buffers, x): predictions1_vmap = vmap(fmodel)(params, buffers, minibatches) -# verify the vmap predictions match the +# verify the ``vmap`` predictions match the assert torch.allclose(predictions1_vmap, torch.stack(predictions_diff_minibatch_loop), atol=1e-3, rtol=1e-5) ###################################################################### # Option 2: get predictions using the same minibatch of data. # -# vmap has an in_dims arg that specifies which dimensions to map over. -# By using ``None``, we tell vmap we want the same minibatch to apply for all of +# ``vmap`` has an ``in_dims`` argument that specifies which dimensions to map over. 
+# By using ``None``, we tell ``vmap`` we want the same minibatch to apply for all of # the 10 models. predictions2_vmap = vmap(fmodel, in_dims=(0, 0, None))(params, buffers, minibatch) @@ -143,9 +143,9 @@ def fmodel(params, buffers, x): ###################################################################### # A quick note: there are limitations around what types of functions can be -# transformed by vmap. The best functions to transform are ones that are pure +# transformed by ``vmap``. The best functions to transform are ones that are pure # functions: a function where the outputs are only determined by the inputs -# that have no side effects (e.g. mutation). vmap is unable to handle mutation +# that have no side effects (e.g. mutation). ``vmap`` is unable to handle mutation # of arbitrary Python data structures, but it is able to handle many in-place # PyTorch operations. @@ -165,11 +165,11 @@ def fmodel(params, buffers, x): print(f'Predictions with vmap {with_vmap.timeit(100)}') ###################################################################### -# There's a large speedup using vmap! +# There's a large speedup using ``vmap``! # -# In general, vectorization with vmap should be faster than running a function +# In general, vectorization with ``vmap`` should be faster than running a function # in a for-loop and competitive with manual batching. There are some exceptions -# though, like if we haven’t implemented the vmap rule for a particular +# though, like if we haven’t implemented the ``vmap`` rule for a particular # operation or if the underlying kernels weren’t optimized for older hardware # (GPUs). If you see any of these cases, please let us know by opening an issue # on GitHub. diff --git a/intermediate_source/flask_rest_api_tutorial.py b/intermediate_source/flask_rest_api_tutorial.py index 39c1a9d39f7..690fa975a5c 100644 --- a/intermediate_source/flask_rest_api_tutorial.py +++ b/intermediate_source/flask_rest_api_tutorial.py @@ -53,7 +53,7 @@ # Simple Web Server # ----------------- # -# Following is a simple webserver, taken from Flask's documentation +# Following is a simple web server, taken from Flask's documentation from flask import Flask @@ -114,7 +114,7 @@ def predict(): # ~~~~~~~~~~~~~~~~~~~ # # DenseNet model requires the image to be of 3 channel RGB image of size -# 224 x 224. We will also normalise the image tensor with the required mean +# 224 x 224. We will also normalize the image tensor with the required mean # and standard deviation values. You can read more about it # `here `_. # diff --git a/intermediate_source/forward_ad_usage.py b/intermediate_source/forward_ad_usage.py index ef194d65023..10965d64ab9 100644 --- a/intermediate_source/forward_ad_usage.py +++ b/intermediate_source/forward_ad_usage.py @@ -25,7 +25,7 @@ to dual numbers[0]. As the forward pass is performed, if any input tensors are dual tensors, -extra computation is performed to propogate this "sensitivity" of the +extra computation is performed to propagate this "sensitivity" of the function. 
""" @@ -68,7 +68,7 @@ def fn(x, y): plain_tensor = torch.randn(10, 10) dual_output = fn(dual_input, plain_tensor) - # Unpacking the dual returns a namedtuple with ``primal`` and ``tangent`` + # Unpacking the dual returns a ``namedtuple`` with ``primal`` and ``tangent`` # as attributes jvp = fwAD.unpack_dual(dual_output).tangent @@ -136,7 +136,7 @@ class Fn(torch.autograd.Function): @staticmethod def forward(ctx, foo): result = torch.exp(foo) - # Tensors stored in ctx can be used in the subsequent forward grad + # Tensors stored in ``ctx`` can be used in the subsequent forward grad # computation. ctx.result = result return result @@ -144,7 +144,7 @@ def forward(ctx, foo): @staticmethod def jvp(ctx, gI): gO = gI * ctx.result - # If the tensor stored in ctx will not also be used in the backward pass, + # If the tensor stored in`` ctx`` will not also be used in the backward pass, # one can manually free it using ``del`` del ctx.result return gO @@ -161,9 +161,9 @@ def jvp(ctx, gI): # It is important to use ``autograd.gradcheck`` to verify that your # custom autograd Function computes the gradients correctly. By default, -# gradcheck only checks the backward-mode (reverse-mode) AD gradients. Specify +# ``gradcheck`` only checks the backward-mode (reverse-mode) AD gradients. Specify # ``check_forward_ad=True`` to also check forward grads. If you did not -# implement the backward formula for your function, you can also tell gradcheck +# implement the backward formula for your function, you can also tell ``gradcheck`` # to skip the tests that require backward-mode AD by specifying # ``check_backward_ad=False``, ``check_undefined_grad=False``, and # ``check_batched_grad=False``. @@ -198,11 +198,11 @@ def fn(x, y): return x ** 2 + y ** 2 # Here is a basic example to compute the JVP of the above function. -# The jvp(func, primals, tangents) returns func(*primals) as well as the -# computed jvp. Each primal must be associated with a tangent of the same shape. +# The ``jvp(func, primals, tangents)`` returns ``func(*primals)`` as well as the +# computed Jacobian-vector product (JVP). Each primal must be associated with a tangent of the same shape. primal_out, tangent_out = ft.jvp(fn, (primal0, primal1), (tangent0, tangent1)) -# functorch.jvp requires every primal to be associated with a tangent. +# ``functorch.jvp`` requires every primal to be associated with a tangent. # If we only want to associate certain inputs to `fn` with tangents, # then we'll need to create a new function that captures inputs without tangents: primal = torch.randn(10, 10) @@ -216,7 +216,7 @@ def fn(x, y): ###################################################################### # Using the functional API with Modules # -------------------------------------------------------------------- -# To use ``nn.Module`` with functorch.jvp to compute Jacobian-vector products +# To use ``nn.Module`` with ``functorch.jvp`` to compute Jacobian-vector products # with respect to the model parameters, we need to reformulate the # ``nn.Module`` as a function that accepts both the model parameters and inputs # to the module. 
@@ -225,16 +225,16 @@ def fn(x, y): input = torch.randn(16, 5) tangents = tuple([torch.rand_like(p) for p in model.parameters()]) -# Given a torch.nn.Module, ft.make_functional_with_buffers extracts the state -# (params and buffers) and returns a functional version of the model that +# Given a ``torch.nn.Module``, ``ft.make_functional_with_buffers`` extracts the state +# (``params`` and buffers) and returns a functional version of the model that # can be invoked like a function. # That is, the returned ``func`` can be invoked like # ``func(params, buffers, input)``. -# ft.make_functional_with_buffers is analogous to the nn.Modules stateless API +# ``ft.make_functional_with_buffers`` is analogous to the ``nn.Modules`` stateless API # that you saw previously and we're working on consolidating the two. func, params, buffers = ft.make_functional_with_buffers(model) -# Because jvp requires every input to be associated with a tangent, we need to +# Because ``jvp`` requires every input to be associated with a tangent, we need to # create a new function that, when given the parameters, produces the output def func_params_only(params): return func(params, buffers, input) diff --git a/intermediate_source/fx_conv_bn_fuser.py b/intermediate_source/fx_conv_bn_fuser.py index c06f5f76835..90620ceba4e 100644 --- a/intermediate_source/fx_conv_bn_fuser.py +++ b/intermediate_source/fx_conv_bn_fuser.py @@ -32,7 +32,7 @@ # For this tutorial, we are going to create a model consisting of convolutions # and batch norms. Note that this model has some tricky components - some of # the conv/batch norm patterns are hidden within Sequentials and one of the -# BatchNorms is wrapped in another Module. +# ``BatchNorms`` is wrapped in another Module. class WrappedBatchNorm(nn.Module): def __init__(self): @@ -137,7 +137,7 @@ def fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): def _parent_name(target : str) -> Tuple[str, str]: """ - Splits a qualname into parent path and last atom. + Splits a ``qualname`` into parent path and last atom. For example, `foo.bar.baz` -> (`foo.bar`, `baz`) """ *parent, name = target.rsplit('.', 1) @@ -242,9 +242,9 @@ def benchmark(model, iters=20): print("Fused time: ", benchmark(fused_rn18)) ###################################################################### # As we previously saw, the output of our FX transformation is -# (Torchscriptable) PyTorch code, we can easily `jit.script` the output to try +# ("torchscriptable") PyTorch code, we can easily ``jit.script`` the output to try # and increase our performance even more. In this way, our FX model -# transformation composes with Torchscript with no issues. +# transformation composes with TorchScript with no issues. jit_rn18 = torch.jit.script(fused_rn18) print("jit time: ", benchmark(jit_rn18)) diff --git a/intermediate_source/fx_profiling_tutorial.py b/intermediate_source/fx_profiling_tutorial.py index 06726e4dd6c..18d8bc67cf4 100644 --- a/intermediate_source/fx_profiling_tutorial.py +++ b/intermediate_source/fx_profiling_tutorial.py @@ -117,7 +117,7 @@ def __init__(self, mod : torch.nn.Module): ###################################################################### # Next, let's override our first method: ``run()``. ``Interpreter``'s ``run`` - # method is the top-level entrypoint for execution of the model. We will + # method is the top-level entry point for execution of the model. We will # want to intercept this so that we can record the total runtime of the # model. 
@@ -129,7 +129,7 @@ def run(self, *args) -> Any: # Record the time we finished running the model t_end = time.time() # Store the total elapsed time this model execution took in the - # ProfilingInterpreter + # ``ProfilingInterpreter`` self.total_runtime_sec.append(t_end - t_start) return return_val @@ -176,7 +176,7 @@ def summary(self, should_sort : bool = False) -> str: # time each node took with respect to the whole network. pct_total = mean_runtime / mean_total_runtime * 100 # Record the node's type, name of the node, mean runtime, and - # percent runtim + # percent runtime. node_summaries.append( [node.op, str(node), mean_runtime, pct_total]) @@ -214,7 +214,7 @@ def summary(self, should_sort : bool = False) -> str: ###################################################################### # There are two things we should call out here: # -# * MaxPool2d takes up the most time. This is a known issue: +# * ``MaxPool2d`` takes up the most time. This is a known issue: # https://github.com/pytorch/pytorch/issues/51393 # * BatchNorm2d also takes up significant time. We can continue this # line of thinking and optimize this in the Conv-BN Fusion with FX @@ -226,7 +226,7 @@ def summary(self, should_sort : bool = False) -> str: # As we can see, using FX we can easily capture PyTorch programs (even # ones we don't have the source code for!) in a machine-interpretable # format and use that for analysis, such as the performance analysis -# we've done here. FX opens up an exiciting world of possibilities for +# we've done here. FX opens up an exciting world of possibilities for # working with PyTorch programs. # # Finally, since FX is still in beta, we would be happy to hear any diff --git a/intermediate_source/jacobians_hessians.py b/intermediate_source/jacobians_hessians.py index 3da8bda11f1..b8b96c30a3e 100644 --- a/intermediate_source/jacobians_hessians.py +++ b/intermediate_source/jacobians_hessians.py @@ -62,7 +62,7 @@ def compute_jac(xp): ###################################################################### # Instead of computing the jacobian row-by-row, we can use PyTorch's # ``torch.vmap`` function transform to get rid of the for-loop and vectorize the -# computation. We can’t directly apply vmap to ``torch.autograd.grad``; +# computation. We can’t directly apply ``vmap`` to ``torch.autograd.grad``; # instead, PyTorch provides a ``torch.func.vjp`` transform that composes with # ``torch.vmap``: @@ -76,15 +76,15 @@ def compute_jac(xp): assert torch.allclose(ft_jacobian, jacobian) ###################################################################### -# In a later tutorial a composition of reverse-mode AD and vmap will give us +# In a later tutorial a composition of reverse-mode AD and ``vmap`` will give us # per-sample-gradients. -# In this tutorial, composing reverse-mode AD and vmap gives us Jacobian +# In this tutorial, composing reverse-mode AD and ``vmap`` gives us Jacobian # computation! -# Various compositions of vmap and autodiff transforms can give us different +# Various compositions of ``vmap`` and autodiff transforms can give us different # interesting quantities. # # PyTorch provides ``torch.func.jacrev`` as a convenience function that performs -# the vmap-vjp composition to compute jacobians. ``jacrev`` accepts an argnums +# the ``vmap-vjp`` composition to compute jacobians. ``jacrev`` accepts an ``argnums`` # argument that says which argument we would like to compute Jacobians with # respect to. 
@@ -92,7 +92,7 @@ def compute_jac(xp): ft_jacobian = jacrev(predict, argnums=2)(weight, bias, x) -# confirm +# Confirm by running the following: assert torch.allclose(ft_jacobian, jacobian) ###################################################################### @@ -100,10 +100,10 @@ def compute_jac(xp): # The function transform version is much faster (and becomes even faster the # more outputs there are). # -# In general, we expect that vectorization via vmap can help eliminate overhead +# In general, we expect that vectorization via ``vmap`` can help eliminate overhead # and give better utilization of your hardware. # -# vmap does this magic by pushing the outer loop down into the function's +# ``vmap`` does this magic by pushing the outer loop down into the function's # primitive operations in order to obtain better performance. # # Let's make a quick function to evaluate performance and deal with @@ -133,34 +133,34 @@ def get_perf(first, first_descriptor, second, second_descriptor): print(with_vmap_timer) ###################################################################### -# Let's do a relative performance comparison of the above with our get_perf function: +# Let's do a relative performance comparison of the above with our ``get_perf`` function: get_perf(no_vmap_timer, "without vmap", with_vmap_timer, "vmap") ###################################################################### -# Furthemore, it’s pretty easy to flip the problem around and say we want to +# Furthermore, it’s pretty easy to flip the problem around and say we want to # compute Jacobians of the parameters to our model (weight, bias) instead of the input -# note the change in input via argnums params of 0,1 to map to weight and bias +# note the change in input via ``argnums`` parameters of 0,1 to map to weight and bias ft_jac_weight, ft_jac_bias = jacrev(predict, argnums=(0, 1))(weight, bias, x) ###################################################################### -# reverse-mode Jacobian (jacrev) vs forward-mode Jacobian (jacfwd) -# -------------------------------------------------------------------- +# Reverse-mode Jacobian (``jacrev``) vs forward-mode Jacobian (``jacfwd``) +# ------------------------------------------------------------------------ # # We offer two APIs to compute jacobians: ``jacrev`` and ``jacfwd``: # -# - jacrev uses reverse-mode AD. As you saw above it is a composition of our -# vjp and vmap transforms. -# - jacfwd uses forward-mode AD. It is implemented as a composition of our -# jvp and vmap transforms. +# - ``jacrev`` uses reverse-mode AD. As you saw above it is a composition of our +# ``vjp`` and ``vmap`` transforms. +# - ``jacfwd`` uses forward-mode AD. It is implemented as a composition of our +# ``jvp`` and ``vmap`` transforms. # -# jacfwd and jacrev can be substituted for each other but they have different +# ``jacfwd`` and ``jacrev`` can be substituted for each other but they have different # performance characteristics. # # As a general rule of thumb, if you’re computing the jacobian of an :math:`R^N \to R^M` -# function, and there are many more outputs than inputs (i.e. :math:`M > N`) then -# jacfwd is preferred, otherwise use jacrev. There are exceptions to this rule, +# function, and there are many more outputs than inputs (for example, :math:`M > N`) then +# ``jacfwd`` is preferred, otherwise use ``jacrev``. 
There are exceptions to this rule, # but a non-rigorous argument for this follows: # # In reverse-mode AD, we are computing the jacobian row-by-row, while in @@ -217,7 +217,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): print(f'jacrev time: {jacrev_timing}') ####################################################################### -# and a relative perf comparison: +# and a relative performance comparison: get_perf(jacrev_timing, "jacrev", jacfwd_timing, "jacfwd") @@ -228,7 +228,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): # Hessians are the jacobian of the jacobian (or the partial derivative of # the partial derivative, aka second order). # -# This suggests that one can just compose functorch’s jacobian transforms to +# This suggests that one can just compose functorch jacobian transforms to # compute the Hessian. # Indeed, under the hood, ``hessian(f)`` is simply ``jacfwd(jacrev(f))``. # @@ -238,7 +238,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): from torch.func import hessian -# lets reduce the size in order not to blow out colab. Hessians require +# lets reduce the size in order not to overwhelm Colab. Hessians require # significant memory: Din = 512 Dout = 32 @@ -251,8 +251,8 @@ def get_perf(first, first_descriptor, second, second_descriptor): hess_revrev = jacrev(jacrev(predict, argnums=2), argnums=2)(weight, bias, x) ####################################################################### -# Let's verify we have the same result regardless of using hessian api or -# using jacfwd(jacfwd()) +# Let's verify we have the same result regardless of using hessian API or +# using ``jacfwd(jacfwd())``. torch.allclose(hess_api, hess_fwdfwd) @@ -265,7 +265,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): # shape ``(B, N)`` and a function that goes from :math:`R^N \to R^M`, we would like # a Jacobian of shape ``(B, M, N)``. # -# The easiest way to do this is to use vmap: +# The easiest way to do this is to use ``vmap``: batch_size = 64 Din = 31 @@ -284,7 +284,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): ####################################################################### # If you have a function that goes from (B, N) -> (B, M) instead and are # certain that each input produces an independent output, then it's also -# sometimes possible to do this without using vmap by summing the outputs +# sometimes possible to do this without using ``vmap`` by summing the outputs # and then computing the Jacobian of that function: def predict_with_output_summed(weight, bias, x): @@ -295,10 +295,10 @@ def predict_with_output_summed(weight, bias, x): ####################################################################### # If you instead have a function that goes from :math:`R^N \to R^M` but inputs that -# are batched, you compose vmap with jacrev to compute batched jacobians: +# are batched, you compose ``vmap`` with ``jacrev`` to compute batched jacobians: # # Finally, batch hessians can be computed similarly. It's easiest to think -# about them by using vmap to batch over hessian computation, but in some +# about them by using ``vmap`` to batch over hessian computation, but in some # cases the sum trick also works. 
compute_batch_hessian = vmap(hessian(predict, argnums=2), in_dims=(None, None, 0)) diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py index 4445704cd1b..21a5635c21b 100755 --- a/intermediate_source/mario_rl_tutorial.py +++ b/intermediate_source/mario_rl_tutorial.py @@ -3,10 +3,7 @@ Train a Mario-playing RL Agent ================ -Authors: `Yuansong Feng `__, `Suraj -Subramanian `__, `Howard -Wang `__, `Steven -Guo `__. +**Authors:** `Yuansong Feng `__, `Suraj Subramanian `__, `Howard Wang `__, `Steven Guo `__. This tutorial walks you through the fundamentals of Deep Reinforcement @@ -308,9 +305,9 @@ def act(self, state): Given a state, choose an epsilon-greedy action and update value of step. Inputs: - state(LazyFrame): A single observation of the current state, dimension is (state_dim) + state(``LazyFrame``): A single observation of the current state, dimension is (state_dim) Outputs: - action_idx (int): An integer representing which action Mario will perform + ``action_idx`` (``int``): An integer representing which action Mario will perform """ # EXPLORE if np.random.rand() < self.exploration_rate: @@ -359,11 +356,11 @@ def cache(self, state, next_state, action, reward, done): Store the experience to self.memory (replay buffer) Inputs: - state (LazyFrame), - next_state (LazyFrame), - action (int), - reward (float), - done(bool)) + state (``LazyFrame``), + next_state (``LazyFrame``), + action (``int``), + reward (``float``), + done(``bool``)) """ def first_if_tuple(x): return x[0] if isinstance(x, tuple) else x @@ -408,7 +405,7 @@ def recall(self): class MarioNet(nn.Module): - """mini cnn structure + """mini CNN structure input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output """ diff --git a/intermediate_source/memory_format_tutorial.py b/intermediate_source/memory_format_tutorial.py index af2842bf333..f08980265de 100644 --- a/intermediate_source/memory_format_tutorial.py +++ b/intermediate_source/memory_format_tutorial.py @@ -40,7 +40,7 @@ N, C, H, W = 10, 3, 32, 32 x = torch.empty(N, C, H, W) -print(x.stride()) # Ouputs: (3072, 1024, 32, 1) +print(x.stride()) # Outputs: (3072, 1024, 32, 1) ###################################################################### # Conversion operator @@ -56,11 +56,11 @@ ###################################################################### # Alternative option x = x.contiguous(memory_format=torch.channels_last) -print(x.stride()) # Ouputs: (3072, 1, 96, 3) +print(x.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # Format checks -print(x.is_contiguous(memory_format=torch.channels_last)) # Ouputs: True +print(x.is_contiguous(memory_format=torch.channels_last)) # Outputs: True ###################################################################### # There are minor difference between the two APIs ``to`` and @@ -82,8 +82,8 @@ # sizes are 1 in order to properly represent the intended memory # format special_x = torch.empty(4, 1, 4, 4) -print(special_x.is_contiguous(memory_format=torch.channels_last)) # Ouputs: True -print(special_x.is_contiguous(memory_format=torch.contiguous_format)) # Ouputs: True +print(special_x.is_contiguous(memory_format=torch.channels_last)) # Outputs: True +print(special_x.is_contiguous(memory_format=torch.contiguous_format)) # Outputs: True ###################################################################### # Same thing applies to explicit permutation API ``permute``. 
In @@ -100,36 +100,36 @@ ###################################################################### # Create as channels last x = torch.empty(N, C, H, W, memory_format=torch.channels_last) -print(x.stride()) # Ouputs: (3072, 1, 96, 3) +print(x.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # ``clone`` preserves memory format y = x.clone() -print(y.stride()) # Ouputs: (3072, 1, 96, 3) +print(y.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # ``to``, ``cuda``, ``float`` ... preserves memory format if torch.cuda.is_available(): y = x.cuda() - print(y.stride()) # Ouputs: (3072, 1, 96, 3) + print(y.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # ``empty_like``, ``*_like`` operators preserves memory format y = torch.empty_like(x) -print(y.stride()) # Ouputs: (3072, 1, 96, 3) +print(y.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # Pointwise operators preserves memory format z = x + y -print(z.stride()) # Ouputs: (3072, 1, 96, 3) +print(z.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### -# Conv, Batchnorm modules using cudnn backends support channels last -# (only works for CudNN >= 7.6). Convolution modules, unlike binary +# ``Conv``, ``Batchnorm`` modules using ``cudnn`` backends support channels last +# (only works for cuDNN >= 7.6). Convolution modules, unlike binary # p-wise operator, have channels last as the dominating memory format. -# IFF all inputs are in contiguous memory format, the operator -# produces output in contiguous memory format. Otherwise, output wil -# be in channels last memroy format. +# If all inputs are in contiguous memory format, the operator +# produces output in contiguous memory format. Otherwise, output will +# be in channels last memory format. if torch.backends.cudnn.version() >= 7603: model = torch.nn.Conv2d(8, 4, 3).cuda().half() @@ -139,7 +139,7 @@ input = input.to(device="cuda", memory_format=torch.channels_last, dtype=torch.float16) out = model(input) - print(out.is_contiguous(memory_format=torch.channels_last)) # Ouputs: True + print(out.is_contiguous(memory_format=torch.channels_last)) # Outputs: True ###################################################################### # When input tensor reaches a operator without channels last support, @@ -152,13 +152,13 @@ # Performance Gains # -------------------------------------------------------------------- # Channels last memory format optimizations are available on both GPU and CPU. -# On GPU, the most significant performance gains are observed on NVidia's +# On GPU, the most significant performance gains are observed on NVIDIA's # hardware with Tensor Cores support running on reduced precision # (``torch.float16``). -# We were able to archive over 22% perf gains with channels last +# We were able to archive over 22% performance gains with channels last # comparing to contiguous format, both while utilizing # 'AMP (Automated Mixed Precision)' training scripts. -# Our scripts uses AMP supplied by NVidia +# Our scripts uses AMP supplied by NVIDIA # https://github.com/NVIDIA/apex. 
# # ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 ./data`` @@ -196,7 +196,7 @@ # Epoch: [0][80/125] Time 0.260 (0.335) Speed 770.324 (597.659) Loss 2.2505953312 (1.0879) Prec@1 50.500 (52.938) Prec@5 100.000 (100.000) ###################################################################### -# Passing ``--channels-last true`` allows running a model in Channels last format with observed 22% perf gain. +# Passing ``--channels-last true`` allows running a model in Channels last format with observed 22% performance gain. # # ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 --channels-last true ./data`` @@ -237,12 +237,12 @@ # Epoch: [0][80/125] Time 0.198 (0.269) Speed 1011.827 (743.883) Loss 2.8196096420 (2.4011) Prec@1 47.500 (50.938) Prec@5 100.000 (100.000) ###################################################################### -# The following list of models has the full support of Channels last and showing 8%-35% perf gains on Volta devices: +# The following list of models has the full support of Channels last and showing 8%-35% performance gains on Volta devices: # ``alexnet``, ``mnasnet0_5``, ``mnasnet0_75``, ``mnasnet1_0``, ``mnasnet1_3``, ``mobilenet_v2``, ``resnet101``, ``resnet152``, ``resnet18``, ``resnet34``, ``resnet50``, ``resnext50_32x4d``, ``shufflenet_v2_x0_5``, ``shufflenet_v2_x1_0``, ``shufflenet_v2_x1_5``, ``shufflenet_v2_x2_0``, ``squeezenet1_0``, ``squeezenet1_1``, ``vgg11``, ``vgg11_bn``, ``vgg13``, ``vgg13_bn``, ``vgg16``, ``vgg16_bn``, ``vgg19``, ``vgg19_bn``, ``wide_resnet101_2``, ``wide_resnet50_2`` # ###################################################################### -# The following list of models has the full support of Channels last and showing 26%-76% perf gains on Intel(R) Xeon(R) Ice Lake (or newer) CPUs: +# The following list of models has the full support of Channels last and showing 26%-76% performance gains on Intel(R) Xeon(R) Ice Lake (or newer) CPUs: # ``alexnet``, ``densenet121``, ``densenet161``, ``densenet169``, ``googlenet``, ``inception_v3``, ``mnasnet0_5``, ``mnasnet1_0``, ``resnet101``, ``resnet152``, ``resnet18``, ``resnet34``, ``resnet50``, ``resnext101_32x8d``, ``resnext50_32x4d``, ``shufflenet_v2_x0_5``, ``shufflenet_v2_x1_0``, ``squeezenet1_0``, ``squeezenet1_1``, ``vgg11``, ``vgg11_bn``, ``vgg13``, ``vgg13_bn``, ``vgg16``, ``vgg16_bn``, ``vgg19``, ``vgg19_bn``, ``wide_resnet101_2``, ``wide_resnet50_2`` # @@ -381,7 +381,7 @@ def attribute(m): # ---------- # There are still many things to do, such as: # -# - Resolving ambiguity of N1HW and NC11 Tensors; +# - Resolving ambiguity of ``N1HW`` and ``NC11`` Tensors; # - Testing of Distributed Training support; # - Improving operators coverage. # diff --git a/intermediate_source/mnist_train_nas.py b/intermediate_source/mnist_train_nas.py index e3141e3d958..4ae6d894fce 100644 --- a/intermediate_source/mnist_train_nas.py +++ b/intermediate_source/mnist_train_nas.py @@ -1,5 +1,5 @@ """ -Example training code for ax_multiobjective_nas_tutorial.py +Example training code for ``ax_multiobjective_nas_tutorial.py`` """ import argparse diff --git a/intermediate_source/model_parallel_tutorial.py b/intermediate_source/model_parallel_tutorial.py index 7e5cb84c499..d7a4da73371 100644 --- a/intermediate_source/model_parallel_tutorial.py +++ b/intermediate_source/model_parallel_tutorial.py @@ -259,11 +259,11 @@ def forward(self, x): ret = [] for s_next in splits: - # A. s_prev runs on cuda:1 + # A. 
``s_prev`` runs on ``cuda:1`` s_prev = self.seq2(s_prev) ret.append(self.fc(s_prev.view(s_prev.size(0), -1))) - # B. s_next runs on cuda:0, which can run concurrently with A + # B. ``s_next`` runs on ``cuda:0``, which can run concurrently with A s_prev = self.seq1(s_next).to('cuda:1') s_prev = self.seq2(s_prev) @@ -339,7 +339,7 @@ def forward(self, x): # still opportunities to further accelerate the training process. For example, # all operations on ``cuda:0`` is placed on its default stream. It means that # computations on the next split cannot overlap with the copy operation of the -# prev split. However, as prev and next splits are different tensors, there is +# ``prev`` split. However, as ``prev`` and next splits are different tensors, there is # no problem to overlap one's computation with the other one's copy. The # implementation need to use multiple streams on both GPUs, and different # sub-network structures require different stream management strategies. As no From 6cae5aeec15bae65d6c05dbd32a3061f38153a85 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 18 Apr 2023 15:58:10 -0700 Subject: [PATCH 2/3] Update mario_rl_tutorial.py --- intermediate_source/mario_rl_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py index 21a5635c21b..ff653d54c11 100755 --- a/intermediate_source/mario_rl_tutorial.py +++ b/intermediate_source/mario_rl_tutorial.py @@ -396,7 +396,7 @@ def recall(self): # In our implementation, we share feature generator ``features`` across # :math:`Q_{online}` and :math:`Q_{target}`, but maintain separate FC # classifiers for each. :math:`\theta_{target}` (the parameters of -# :math:`Q_{target}`) is frozen to prevent updation by backprop. Instead, +# :math:`Q_{target}`) is frozen to prevent updating by backprop. Instead, # it is periodically synced with :math:`\theta_{online}` (more on this # later). # From 7f7e62e8a34581115c7bbc8f5dca55fb3dea57ff Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 18 Apr 2023 15:58:31 -0700 Subject: [PATCH 3/3] Update en-wordlist.txt --- en-wordlist.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/en-wordlist.txt b/en-wordlist.txt index 7fd34f1ee56..9a4a99d1df2 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -278,7 +278,6 @@ unfused unimodal unnormalized unpickling -updation utils vectorization vectorize