From 3aaa3d24a6960e81d1ef0e492bca263cb7a3d1df Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 18 Apr 2023 15:54:01 -0700 Subject: [PATCH 1/7] Pyspelling: intermediate Python tutorials N-Z --- .pyspelling.yml | 37 ++++++++------ en-wordlist.txt | 25 ++++++++++ intermediate_source/neural_tangent_kernels.py | 12 ++--- intermediate_source/nvfuser_intro_tutorial.py | 50 +++++++++---------- intermediate_source/parametrizations.py | 10 ++-- intermediate_source/per_sample_grads.py | 20 ++++---- intermediate_source/pipeline_tutorial.py | 14 +++--- 7 files changed, 99 insertions(+), 69 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index 9dce7c8215a..598ce7698df 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -2,22 +2,27 @@ spellchecker: aspell matrix: - name: python sources: - - beginner_source/*.py - - intermediate_source/autograd_saved_tensors_hooks_tutorial.py - - intermediate_source/ax_multiobjective_nas_tutorial.py - - intermediate_source/char_rnn_classification_tutorial.py - - intermediate_source/char_rnn_generation_tutorial.py - - intermediate_source/custom_function_conv_bn_tutorial.py - - intermediate_source/ensembling.py + #- beginner_source/*.py + #- intermediate_source/autograd_saved_tensors_hooks_tutorial.py + #- intermediate_source/ax_multiobjective_nas_tutorial.py + #- intermediate_source/char_rnn_classification_tutorial.py + #- intermediate_source/char_rnn_generation_tutorial.py + #- intermediate_source/custom_function_conv_bn_tutorial.py + #- intermediate_source/ensembling.py #- intermediate_source/flask_rest_api_tutorial.py - - intermediate_source/forward_ad_usage.py - - intermediate_source/fx_conv_bn_fuser.py - - intermediate_source/fx_profiling_tutorial.py - - intermediate_source/jacobians_hessians.py - - intermediate_source/mario_rl_tutorial.py - - intermediate_source/mnist_train_nas.py - - intermediate_source/memory_format_tutorial.py - - intermediate_source/model_parallel_tutorial.py + #- intermediate_source/forward_ad_usage.py + #- intermediate_source/fx_conv_bn_fuser.py + #- intermediate_source/fx_profiling_tutorial.py + #- intermediate_source/jacobians_hessians.py + #- intermediate_source/mario_rl_tutorial.py + #- intermediate_source/mnist_train_nas.py + #- intermediate_source/memory_format_tutorial.py + #- intermediate_source/model_parallel_tutorial.py + #- intermediate_source/neural_tangent_kernels.py + #- intermediate_source/nvfuser_intro_tutorial.py + #- intermediate_source/parametrizations.py + #- intermediate_source/per_sample_grads.py + - intermediate_source/pipeline_tutorial.py dictionary: wordlists: - en-wordlist.txt @@ -46,7 +51,7 @@ matrix: - open: '(?s)^::\n\n ' close: '^\n' # Ignore reStructuredText block directives - - open: '\.\. (code-block)::.*$\n*' + - open: '\.\. 
(code-block|math)::.*$\n*' content: '(?P(^(?P[ ]+).*$\n))(?P(^([ \t]+.*|[ \t]*)$\n)*)' close: '(^(?![ \t]+.*$))' - pyspelling.filters.markdown: diff --git a/en-wordlist.txt b/en-wordlist.txt index 9a4a99d1df2..2c9f5e270c8 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,3 +1,19 @@ +RPC +multihead +GPU's +Lipschitz +Frobenius +reimplement +reimplements +reimplementing +parametrizing +unparametrized +submodules +SPD +Cayley +parametrization +parametrized +parametrizations APIs Args Autograd @@ -38,6 +54,7 @@ GANs GPUs GRU GRUs +GTC GeForce Goodfellow Goodfellow’s @@ -69,6 +86,7 @@ NAS NCHW NES NLP +NTK NaN NeurIPS NumPy @@ -161,6 +179,7 @@ finetuning fp functorch fuser +geomean grayscale hardcode helpdesk @@ -204,6 +223,8 @@ ndarrays num numericalize numpy +nvFuser +nvFuser's optimizable optimizer's optimizers @@ -213,6 +234,7 @@ parallelization perceptibility pipelining pointwise +precompute precomputing prepend preprocess @@ -229,6 +251,7 @@ quantizing queryable randint readably +recomputation reinitializes relu reproducibility @@ -262,6 +285,7 @@ timesteps tokenization tokenize tokenizer +topologies torchaudio torchdata torchscriptable @@ -278,6 +302,7 @@ unfused unimodal unnormalized unpickling +updation utils vectorization vectorize diff --git a/intermediate_source/neural_tangent_kernels.py b/intermediate_source/neural_tangent_kernels.py index 5d897bfa31f..ca1de89daf1 100644 --- a/intermediate_source/neural_tangent_kernels.py +++ b/intermediate_source/neural_tangent_kernels.py @@ -58,7 +58,7 @@ def forward(self, x): # we will need a function that accepts the parameters of the model and a single # input (as opposed to a batch of inputs!) and returns a single output. # -# We'll use ``torch.func.functional_call``, which allows us to call an nn.Module +# We'll use ``torch.func.functional_call``, which allows us to call an ``nn.Module`` # using different parameters/buffers, to help accomplish the first step. # # Keep in mind that the model was originally written to accept a batch of input @@ -200,10 +200,10 @@ def func_x2(params): output, vjp_fn = vjp(func_x1, params) def get_ntk_slice(vec): - # This computes vec @ J(x2).T + # This computes ``vec @ J(x2).T`` # `vec` is some unit vector (a single slice of the Identity matrix) vjps = vjp_fn(vec) - # This computes J(X1) @ vjps + # This computes ``J(X1) @ vjps`` _, jvps = jvp(func_x2, (params,), vjps) return jvps @@ -211,10 +211,10 @@ def get_ntk_slice(vec): basis = torch.eye(output.numel(), dtype=output.dtype, device=output.device).view(output.numel(), -1) return vmap(get_ntk_slice)(basis) - # get_ntk(x1, x2) computes the NTK for a single data point x1, x2 - # Since the x1, x2 inputs to empirical_ntk_ntk_vps are batched, + # ``get_ntk(x1, x2)`` computes the NTK for a single data point x1, x2 + # Since the x1, x2 inputs to ``empirical_ntk_ntk_vps`` are batched, # we actually wish to compute the NTK between every pair of data points - # between {x1} and {x2}. That's what the vmaps here do. + # between {x1} and {x2}. That's what the ``vmaps`` here do. result = vmap(vmap(get_ntk, (None, 0)), (0, None))(x1, x2) if compute == 'full': diff --git a/intermediate_source/nvfuser_intro_tutorial.py b/intermediate_source/nvfuser_intro_tutorial.py index 91166fcce1e..155c1471a72 100644 --- a/intermediate_source/nvfuser_intro_tutorial.py +++ b/intermediate_source/nvfuser_intro_tutorial.py @@ -71,7 +71,7 @@ # networks, so improving the speed of these operations can improve # overall network training speed. 
Future releases of nvFuser will # improve the performance of Linear Layers, but for now we will -# specifically look at the Bias-Dropout-Add-LayerNorm section of this +# specifically look at the ``Bias-Dropout-Add-LayerNorm`` section of this # Transformer Block. # # .. figure:: /_static/img/nvfuser_intro/nvfuser_transformer_block.png @@ -154,7 +154,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # Run model, forward and backward output = forward_func() output.backward(grad_output) - # delete gradiens to avoid profiling the gradient accumulation + # delete gradients to avoid profiling the gradient accumulation for p in parameters: p.grad = None @@ -165,7 +165,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # Run model, forward and backward output = forward_func() output.backward(grad_output) - # delete gradiens to avoid profiling the gradient accumulation + # delete gradients to avoid profiling the gradient accumulation for p in parameters: p.grad = None @@ -265,7 +265,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # nvFuser took around 2.4s in total to compile these high speed # GPU functions. # -# nvFuser’s capabilities extend well beyond this initial performance gain. +# nvFuser's capabilities extend well beyond this initial performance gain. # ###################################################################### @@ -281,7 +281,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # To use nvFuser on inputs that change shape from iteration, we # generate new input and output gradient tensors and make a few # different sizes. Since the last dimension is shared with the -# parameters and cannot be changed dynamically in LayerNorm, we +# parameters and cannot be changed dynamically in ``LayerNorm``, we # perturb the first two dimensions of the input and gradient tensors. # @@ -390,7 +390,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # ###################################################################### -# Defining novel operations with nvFuser and FuncTorch +# Defining novel operations with nvFuser and functorch # ---------------------------------------------------- # # One of the primary benefits of nvFuser is the ability to define @@ -398,8 +398,8 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # just-in-time compiled into efficient kernels. # # PyTorch has strong performance for any individual operation, -# especially composite operations like LayerNorm. However, if -# LayerNorm wasn’t already implemented in PyTorch as a composite +# especially composite operations like ``LayerNorm``. However, if +# ``LayerNorm`` wasn’t already implemented in PyTorch as a composite # operation, then you’d have to define it as a series of simpler # (primitive) operations. Let’s make such a definition and run it # without nvFuser. @@ -488,7 +488,7 @@ def primitive_definition( # # However, the performance is still slower than the original eager # mode performance of the composite definition. TorchScript works well -# when predefined composite operations are used, however TorchScript’s +# when predefined composite operations are used, however TorchScript # application of Autograd saves all of the activations for each # operator in the fusion for re-use in the backwards pass. However, # this is not typically the optimal choice. 
Especially when chaining @@ -499,7 +499,7 @@ def primitive_definition( # It’s possible to optimize away many of these unnecessary memory # accesses, but it requires building a connected forward and backward # graph which isn’t possible with TorchScript. The -# `memory_efficient_fusion` pass in FuncTorch, however, is such an +# ``memory_efficient_fusion`` pass in functorch, however, is such an # optimization pass. To use this pass, we have to redefine our # function to pull the constants inside (for now it’s easiest to make # non-tensor constants literals in the function definition): @@ -527,11 +527,11 @@ def primitive_definition_for_memory_efficient_fusion( ###################################################################### # Now, instead of passing our function to TorchScript, we will pass it -# to FuncTorch’s optimization pass. +# to functorch optimization pass. # -# Optimize the model with FuncTorch tracing and the memory efficiency +# Optimize the model with functorch tracing and the memory efficiency # optimization pass memory_efficient_primitive_definition = memory_efficient_fusion( primitive_definition_for_memory_efficient_fusion @@ -550,22 +550,22 @@ def primitive_definition_for_memory_efficient_fusion( ###################################################################### # This recovers even more speed, but it’s still not as fast as -# TorchScripts original performance with the composite definition. +# TorchScript original performance with the composite definition. # However, this is still faster than running this new definition # without nvFuser, and is still faster than the composite definition # without nvFuser. # # .. figure:: /_static/img/nvfuser_intro/nvfuser_tutorial_5.png # -# .. note:: FuncTorch’s memory efficient pass is experimental and still +# .. note:: The functorch memory efficient pass is experimental and still # actively in development. # Future versions of the API are expected to achieve performance # closer to that of TorchScript with the composite definition. # -# .. note:: FuncTorch’s memory efficient pass specializes on the shapes of +# .. note:: The functorch memory efficient pass specializes on the shapes of # the inputs to the function. If new inputs are provided with # different shapes, then you need to construct a new function -# using `memory_efficient_fusion` and apply it to the new inputs. +# using ``memory_efficient_fusion`` and apply it to the new inputs. ###################################################################### @@ -577,10 +577,10 @@ def primitive_definition_for_memory_efficient_fusion( # an entirely new operation in PyTorch – which takes a lot of time and # knowledge of the lower-level PyTorch code as well as parallel # programming – or writing the operation in simpler PyTorch ops and -# settling for poor performance. For example, let's replace LayerNorm -# in our example with RMSNorm. Even though RMSNorm is a bit simpler -# than LayerNorm, it doesn’t have an existing compound operation in -# PyTorch. See the `Root Mean Square Layer Normalization `__ paper for more information about RMSNorm. +# settling for poor performance. For example, let's replace ``LayerNorm`` +# in our example with ``RMSNorm``. Even though ``RMSNorm`` is a bit simpler +# than ``LayerNorm``, it doesn’t have an existing compound operation in +# PyTorch. See the `Root Mean Square Layer Normalization `__ paper for more information about ``RMSNorm``. # As before, we’ll define our new transformer block with # primitive PyTorch operations. 
# @@ -608,7 +608,7 @@ def with_rms_norm( # As before, we’ll get a baseline by running PyTorch without nvFuser. # -# Profile rms_norm +# Profile ``rms_norm`` func = functools.partial( with_rms_norm, input1, @@ -625,7 +625,7 @@ def with_rms_norm( # With nvFuser through TorchScript. # -# Profile scripted rms_norm +# Profile scripted ``rms_norm`` scripted_with_rms_norm = torch.jit.script(with_rms_norm) func = functools.partial( scripted_with_rms_norm, @@ -656,7 +656,7 @@ def with_rms_norm_for_memory_efficient_fusion( return norm_output -# Profile memory efficient rms_norm +# Profile memory efficient ``rms_norm`` memory_efficient_rms_norm = memory_efficient_fusion( with_rms_norm_for_memory_efficient_fusion ) @@ -666,12 +666,12 @@ def with_rms_norm_for_memory_efficient_fusion( ###################################################################### # .. figure:: /_static/img/nvfuser_intro/nvfuser_tutorial_6.png # -# Since RMSNorm is simpler than LayerNorm the performance of our new +# Since ``RMSNorm`` is simpler than ``LayerNorm`` the performance of our new # transformer block is a little higher than the primitive definition # without nvFuser (354 iterations per second compared with 260 # iterations per second). With TorchScript, the iterations per second # increases by 2.68x and 3.36x to 952 iterations per second and 1,191 -# iterations per second with TorchScript and FuncTorch’s memory +# iterations per second with TorchScript and functorch memory # efficient optimization pass, respectively. The performance of this # new operation nearly matches the performance of the composite Layer # Norm definition with TorchScript. diff --git a/intermediate_source/parametrizations.py b/intermediate_source/parametrizations.py index 0f71a0aafe6..086a4300674 100644 --- a/intermediate_source/parametrizations.py +++ b/intermediate_source/parametrizations.py @@ -19,7 +19,7 @@ This approach proposes to decouple the learning of the parameters from the learning of their norms. To do so, the parameter is divided by its `Frobenius norm `_ -and a separate parameter encoding its norm is learnt. +and a separate parameter encoding its norm is learned. A similar regularization was proposed for GANs under the name of "`spectral normalization `_". This method controls the Lipschitz constant of the network by dividing its parameters by @@ -84,7 +84,7 @@ def forward(self, x): # 2) It does not separate the layer and the parametrization. If the parametrization were # more difficult, we would have to rewrite its code for each layer that we want to use it # in. -# 3) It recomputes the parametrization everytime we use the layer. If we use the layer +# 3) It recomputes the parametrization every time we use the layer. If we use the layer # several times during the forward pass, (imagine the recurrent kernel of an RNN), it # would compute the same ``A`` every time that the layer is called. # @@ -258,8 +258,8 @@ def forward(self, X): print((torch.symeig(X).eigenvalues > 0.).all()) # X is positive definite ############################################################################### -# Intializing parametrizations -# ---------------------------- +# Initializing parametrizations +# ----------------------------- # # Parametrizations come with a mechanism to initialize them. 
If we implement a method # ``right_inverse`` with signature @@ -327,7 +327,7 @@ def right_inverse(self, A): ############################################################################### # The name of this method comes from the fact that we would often expect # that ``forward(right_inverse(X)) == X``. This is a direct way of rewriting that -# the forward afer the initalization with value ``X`` should return the value ``X``. +# the forward after the initialization with value ``X`` should return the value ``X``. # This constraint is not strongly enforced in practice. In fact, at times, it might be of # interest to relax this relation. For example, consider the following implementation # of a randomized pruning method: diff --git a/intermediate_source/per_sample_grads.py b/intermediate_source/per_sample_grads.py index 9d2c774e9fc..c423679229c 100644 --- a/intermediate_source/per_sample_grads.py +++ b/intermediate_source/per_sample_grads.py @@ -70,7 +70,7 @@ def loss_fn(predictions, targets): predictions = model(data) # move the entire mini-batch through the model loss = loss_fn(predictions, targets) -loss.backward() # back propogate the 'average' gradient of this mini-batch +loss.backward() # back propagate the 'average' gradient of this mini-batch ###################################################################### # In contrast to the above approach, per-sample-gradient computation is @@ -114,7 +114,7 @@ def compute_sample_grads(data, targets): # Our strategy is to define a function that computes the loss and then apply # transforms to construct a function that computes per-sample-gradients. # -# We'll use the ``torch.func.functional_call`` function to treat an nn.Module +# We'll use the ``torch.func.functional_call`` function to treat an ``nn.Module`` # like a function. # # First, let’s extract the state from ``model`` into two dictionaries, @@ -146,16 +146,16 @@ def compute_loss(params, buffers, sample, target): ###################################################################### # Now, let’s use the ``grad`` transform to create a new function that computes # the gradient with respect to the first argument of ``compute_loss`` -# (i.e. the params). +# (i.e. the ``params``). ft_compute_grad = grad(compute_loss) ###################################################################### # The ``ft_compute_grad`` function computes the gradient for a single -# (sample, target) pair. We can use vmap to get it to compute the gradient +# (sample, target) pair. We can use ``vmap`` to get it to compute the gradient # over an entire batch of samples and targets. Note that # ``in_dims=(None, None, 0, 0)`` because we wish to map ``ft_compute_grad`` over -# the 0th dimension of the data and targets, and use the same params and +# the 0th dimension of the data and targets, and use the same ``params`` and # buffers for each. ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, None, 0, 0)) @@ -174,16 +174,16 @@ def compute_loss(params, buffers, sample, target): ###################################################################### # A quick note: there are limitations around what types of functions can be -# transformed by vmap. The best functions to transform are ones that are pure +# transformed by ``vmap``. The best functions to transform are ones that are pure # functions: a function where the outputs are only determined by the inputs, -# and that have no side effects (e.g. mutation). vmap is unable to handle +# and that have no side effects (e.g. mutation). 
``vmap`` is unable to handle
 # mutation of arbitrary Python data structures, but it is able to handle many
 # in-place PyTorch operations.
 #
 # Performance comparison
 # ----------------------
 #
-# Curious about how the performance of vmap compares?
+# Curious about how the performance of ``vmap`` compares?
 #
 # Currently the best results are obtained on newer GPU's such as the A100
 # (Ampere) where we've seen up to 25x speedups on this example, but here are
@@ -218,9 +218,9 @@ def get_perf(first, first_descriptor, second, second_descriptor):
 # the naive method. But it’s cool that composing ``vmap`` and ``grad`` give us a
 # nice speedup.
 #
-# In general, vectorization with vmap should be faster than running a function
+# In general, vectorization with ``vmap`` should be faster than running a function
 # in a for-loop and competitive with manual batching. There are some exceptions
-# though, like if we haven’t implemented the vmap rule for a particular
+# though, like if we haven’t implemented the ``vmap`` rule for a particular
 # operation or if the underlying kernels weren’t optimized for older hardware
 # (GPUs). If you see any of these cases, please let us know by opening an issue
 # at on GitHub.
diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py
index bdd6cabb3f2..398d3cdf28a 100644
--- a/intermediate_source/pipeline_tutorial.py
+++ b/intermediate_source/pipeline_tutorial.py
@@ -35,7 +35,7 @@
 # As a result, our focus is on ``nn.TransformerEncoder`` and we split the model
 # such that half of the ``nn.TransformerEncoderLayer`` are on one GPU and the
 # other half are on another. To do this, we pull out the ``Encoder`` and
-# ``Decoder`` sections into seperate modules and then build an nn.Sequential
+# ``Decoder`` sections into seperate modules and then build an ``nn.Sequential``
 # representing the original Transformer module.
 
 import sys
@@ -172,11 +172,11 @@ def data_process(raw_text_iter):
 device = torch.device("cuda")
 
 def batchify(data, bsz):
-    # Divide the dataset into bsz parts.
+    # Divide the dataset into ``bsz`` parts.
     nbatch = data.size(0) // bsz
     # Trim off any extra elements that wouldn't cleanly fit (remainders).
     data = data.narrow(0, 0, nbatch * bsz)
-    # Evenly divide the data across the bsz batches.
+    # Evenly divide the data across the ``bsz`` batches. 
data = data.view(bsz, -1).t().contiguous() return data.to(device) @@ -245,9 +245,9 @@ def get_batch(source, i): ntokens = len(vocab) # the size of vocabulary emsize = 4096 # embedding dimension -nhid = 4096 # the dimension of the feedforward network model in nn.TransformerEncoder -nlayers = 12 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder -nhead = 16 # the number of heads in the multiheadattention models +nhid = 4096 # the dimension of the feedforward network model in ``nn.TransformerEncoder`` +nlayers = 12 # the number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder`` +nhead = 16 # the number of heads in the Multihead Attention models dropout = 0.2 # the dropout value from torch.distributed import rpc From 675e9e49ee1397e2f7af9523174b9b1d7c8a3d60 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 19 Apr 2023 09:01:55 -0700 Subject: [PATCH 2/7] Update --- .pyspelling.yml | 3 ++- en-wordlist.txt | 6 ++++++ intermediate_source/pipeline_tutorial.py | 23 ++++++++++++----------- intermediate_source/pruning_tutorial.py | 4 ++-- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index 598ce7698df..570785eee7e 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -22,7 +22,8 @@ matrix: #- intermediate_source/nvfuser_intro_tutorial.py #- intermediate_source/parametrizations.py #- intermediate_source/per_sample_grads.py - - intermediate_source/pipeline_tutorial.py + #- intermediate_source/pipeline_tutorial.py + - intermediate_source/pruning_tutorial.py dictionary: wordlists: - en-wordlist.txt diff --git a/en-wordlist.txt b/en-wordlist.txt index 2c9f5e270c8..9e88bc1f7bf 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,3 +1,9 @@ +subnetworks +sparsify +LeCun +prepruned +dimensionality +unpruned RPC multihead GPU's diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py index 398d3cdf28a..444eddf8415 100644 --- a/intermediate_source/pipeline_tutorial.py +++ b/intermediate_source/pipeline_tutorial.py @@ -35,7 +35,7 @@ # As a result, our focus is on ``nn.TransformerEncoder`` and we split the model # such that half of the ``nn.TransformerEncoderLayer`` are on one GPU and the # other half are on another. To do this, we pull out the ``Encoder`` and -# ``Decoder`` sections into seperate modules and then build an ``nn.Sequential`` +# ``Decoder`` sections into separate modules and then build an ``nn.Sequential`` # representing the original Transformer module. import sys @@ -134,16 +134,17 @@ def forward(self, x): # length 6: # # .. 
math:: -# \begin{bmatrix} -# \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z} -# \end{bmatrix} -# \Rightarrow -# \begin{bmatrix} -# \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} & -# \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} & -# \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} & -# \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix} -# \end{bmatrix} +# +# \begin{bmatrix} +# \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z} +# \end{bmatrix} +# \Rightarrow +# \begin{bmatrix} +# \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} & +# \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} & +# \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} & +# \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix} +# \end{bmatrix} # # These columns are treated as independent by the model, which means that # the dependence of ``G`` and ``F`` can not be learned, but allows more diff --git a/intermediate_source/pruning_tutorial.py b/intermediate_source/pruning_tutorial.py index d8de5a7502a..ba6701c8c35 100644 --- a/intermediate_source/pruning_tutorial.py +++ b/intermediate_source/pruning_tutorial.py @@ -339,8 +339,8 @@ def forward(self, x): # pruning this technique implements (supported options are ``global``, # ``structured``, and ``unstructured``). This is needed to determine # how to combine masks in the case in which pruning is applied -# iteratively. In other words, when pruning a pre-pruned parameter, -# the current prunining techique is expected to act on the unpruned +# iteratively. In other words, when pruning a prepruned parameter, +# the current pruning technique is expected to act on the unpruned # portion of the parameter. 
Specifying the ``PRUNING_TYPE`` will # enable the ``PruningContainer`` (which handles the iterative # application of pruning masks) to correctly identify the slice of the From 38a94d58274425dbb202195afdf6c89cc35b2b65 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 15:29:26 -0700 Subject: [PATCH 3/7] Update --- .pyspelling.yml | 19 +- 1 | 341 ++++++++++++++++++ en-wordlist.txt | 90 +++-- .../flask_rest_api_tutorial.py | 6 +- intermediate_source/reinforcement_ppo.py | 48 +-- .../reinforcement_q_learning.py | 18 +- .../scaled_dot_product_attention_tutorial.py | 39 +- .../seq2seq_translation_tutorial.py | 12 +- .../tensorboard_profiler_tutorial.py | 16 +- intermediate_source/torch_compile_tutorial.py | 6 +- 10 files changed, 499 insertions(+), 96 deletions(-) create mode 100644 1 diff --git a/.pyspelling.yml b/.pyspelling.yml index 570785eee7e..bce94f383cc 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -2,7 +2,8 @@ spellchecker: aspell matrix: - name: python sources: - #- beginner_source/*.py + - beginner_source/*.py + - intermediate_source/*.py #- intermediate_source/autograd_saved_tensors_hooks_tutorial.py #- intermediate_source/ax_multiobjective_nas_tutorial.py #- intermediate_source/char_rnn_classification_tutorial.py @@ -23,7 +24,14 @@ matrix: #- intermediate_source/parametrizations.py #- intermediate_source/per_sample_grads.py #- intermediate_source/pipeline_tutorial.py - - intermediate_source/pruning_tutorial.py + #- intermediate_source/pruning_tutorial.py + #- intermediate_source/reinforcement_ppo.py + #- intermediate_source/reinforcement_q_learning.py + #- intermediate_source/scaled_dot_product_attention_tutorial.py + #- intermediate_source/seq2seq_translation_tutorial.py + #- intermediate_source/spatial_transformer_tutorial.py + #- intermediate_source/tensorboard_profiler_tutorial.py + #- intermediate_source/torch_compile_tutorial.py dictionary: wordlists: - en-wordlist.txt @@ -36,9 +44,16 @@ matrix: # Exclude figure rST tags - open: '\.\.\s+(figure|literalinclude|math|image|grid)::' close: '\n' + # Exclude roles: + - open: ':(?:(class|py:mod|mod|func)):`' + content: '[^`]*' + close: '`' # Exclude raw directive - open: '\.\. 
(raw)::.*$\n*' close: '\n' + # Exclude + - open: '.*(:py:mod:).*' + close: ' ' # Exclude Python coding directives - open: '-\*- coding:' close: '\n' diff --git a/1 b/1 new file mode 100644 index 00000000000..996a909cd88 --- /dev/null +++ b/1 @@ -0,0 +1,341 @@ +Andrej +Karpathy's +NanoGPT +compilable +decorrelated +DQN +deterministically +approximators +duration +CartPole +EPS +APIs +Args +Autograd +BCE +BN +BOS +Bahdanau +BatchNorm +CHW +CIFAR +CLS +CNNDM +CNNs +CPUs +CUDA +Cayley +Chatbots +Colab +Conv +ConvNet +ConvNets +DCGAN +DCGANs +DDQN +DNN +DataLoaders +DeepMind +DeiT +DenseNet +EOS +FC +FGSM +FLAVA +FX +FX's +FloydHub +FloydHub's +Frobenius +GAE +GAN +GANs +GPU's +GPUs +GRU +GRUs +GTC +GeForce +Goodfellow +Goodfellow’s +GreedySearchDecoder +HVP +Hugging Face +IMDB +ImageNet +Initializations +Iteratively +JSON +JVP +Jacobian +Kiuk +Kubernetes +Kuei +LSTM +LSTMs +LeCun +LeNet +LeakyReLU +LeakyReLUs +Lipschitz +Lua +Luong +MLP +MLPs +MNIST +Mypy +NAS +NCHW +NES +NLP +NTK +NaN +NeurIPS +NumPy +Numericalization +Numpy's +OpenAI +PPO +Plotly +Prec +Profiler +PyTorch's +RGB +RL +RNN +RNNs +RPC +RTX +Radford +ReLU +ResNet +SPD +SST2 +Sequentials +Sigmoid +SoTA +TPU +TensorBoard +TextVQA +Tokenization +TorchMultimodal +TorchRL +TorchRL's +TorchScript +TorchX +Tunable +Unescape +VQA +Wikitext +Xeon +accuracies +activations +adversarially +al +autodiff +autograd +backend +backends +backprop +backpropagate +backpropagated +backpropagates +backpropagation +batchnorm +batchnorm's +benchmarking +boolean +broadcasted +cardinality +chatbot +chatbot's +checkpointing +composable +concat +config +contrastive +conv +convolutional +cpu +csv +cuDNN +datafile +dataframe +dataloader +dataloaders +datapipes +dataset +datasets +dataset’s +deserialize +deserialized +dimensionality +dir +downsample +downsamples +embeddings +encodings +ensembling +eq +et +evaluateInput +extensibility +fastai +fbgemm +feedforward +finetune +finetuning +fp +functorch +fuser +geomean +grayscale +hardcode +helpdesk +helpdesks +hessian +hessians +hvp +hyperparameter +hyperparameters +imagenet +initializations +inlined +interpretable +io +iterable +iteratively +jacobian +jacobians +jit +jpg +kwargs +labelled +learnable +learnings +loadFilename +manualSeed +matplotlib +minibatch +minibatches +minimax +misclassification +misclassified +modularity +modularized +multihead +multimodal +multimodality +multiobjective +multiprocessed +multithreaded +namespace +natively +ndarrays +num +numericalize +numpy +nvFuser +nvFuser's +optimizable +optimizer's +optimizers +overfitting +parallelizable +parallelization +parametrization +parametrizations +parametrized +parametrizing +perceptibility +pipelining +pointwise +precompute +precomputing +prepend +preprocess +preprocessing +prepruned +prespecified +pretrained +prewritten +primals +profiler +profilers +pytorch +quantized +quantizing +queryable +randint +readably +recomputation +reimplement +reimplementing +reimplements +reinitializes +relu +reproducibility +rescale +resnet +restride +rewinded +rollout +romanized +runnable +runtime +runtime +runtimes +scalable +softmax +sparsify +specificities +src +stacktrace +stateful +storages +strided +subclasses +subclassing +subdirectories +submodule +submodules +subnetworks +subreddit +summarization +tanh +th +thresholding +timestep +timesteps +tokenization +tokenize +tokenizer +topologies +torchaudio +torchdata +torchscriptable +torchtext +torchtext's +torchvision +torchviz +traceback +tradeoff +tradeoffs +uncomment +uncommented +unfused +unimodal 
+unnormalized +unparametrized +unpickling +unpruned +updation +utils +vectorization +vectorize +vectorized +vhp +voc +walkthrough +warmstart +warmstarting diff --git a/en-wordlist.txt b/en-wordlist.txt index 9e88bc1f7bf..d8ddb614157 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,26 +1,10 @@ -subnetworks -sparsify -LeCun -prepruned -dimensionality -unpruned -RPC -multihead -GPU's -Lipschitz -Frobenius -reimplement -reimplements -reimplementing -parametrizing -unparametrized -submodules -SPD -Cayley -parametrization -parametrized -parametrizations +UI +bytecode +TorchInductor +unoptimized +TorchDynamo APIs +ATen Args Autograd BCE @@ -35,6 +19,8 @@ CNNDM CNNs CPUs CUDA +CartPole +Cayley Chatbots Colab Conv @@ -42,12 +28,16 @@ ConvNet ConvNets DCGAN DCGANs +DDP DDQN DNN +DQN DataLoaders +DeepMind DeiT DenseNet EOS +EPS FC FGSM FLAVA @@ -55,8 +45,12 @@ FX FX's FloydHub FloydHub's +Frobenius +GAE GAN GANs +GLOO +GPU's GPUs GRU GRUs @@ -68,6 +62,7 @@ GreedySearchDecoder HVP Hugging Face IMDB +IOT ImageNet Initializations Iteratively @@ -79,26 +74,32 @@ Kubernetes Kuei LSTM LSTMs +LeCun LeNet LeakyReLU LeakyReLUs +Lipschitz Lua Luong MLP MLPs MNIST +MacBook Mypy NAS +NCCL NCHW NES NLP NTK NaN +NanoGPT NeurIPS NumPy Numericalization Numpy's OpenAI +PPO Plotly Prec Profiler @@ -107,11 +108,16 @@ RGB RL RNN RNNs +RPC RTX Radford ReLU ResNet +SDPA +SGD +SPD SST2 +STN Sequentials Sigmoid SoTA @@ -120,18 +126,27 @@ TensorBoard TextVQA Tokenization TorchMultimodal +TorchRL +TorchRL's TorchScript TorchX Tunable Unescape VQA +VS Code Wikitext Xeon accuracies activations adversarially +affine al +allocator +allocator's +allocators +approximators autodiff +autoencoder autograd backend backends @@ -145,9 +160,12 @@ batchnorm's benchmarking boolean broadcasted +cardinality chatbot chatbot's checkpointing +colorbar +compilable composable concat config @@ -165,11 +183,17 @@ datapipes dataset datasets dataset’s +deallocation +decorrelated deserialize deserialized +deterministically +dimensionality dir downsample downsamples +dropdown +duration embeddings encodings ensembling @@ -199,6 +223,7 @@ imagenet initializations inlined interpretable +invariance io iterable iteratively @@ -206,9 +231,11 @@ jacobian jacobians jit jpg +judgements kwargs labelled learnable +learnings loadFilename manualSeed matplotlib @@ -219,9 +246,11 @@ misclassification misclassified modularity modularized +multihead multimodal multimodality multiobjective +multiprocessed multithreaded namespace natively @@ -237,6 +266,10 @@ optimizers overfitting parallelizable parallelization +parametrization +parametrizations +parametrized +parametrizing perceptibility pipelining pointwise @@ -245,6 +278,7 @@ precomputing prepend preprocess preprocessing +prepruned prespecified pretrained prewritten @@ -258,6 +292,10 @@ queryable randint readably recomputation +regressor +reimplement +reimplementing +reimplements reinitializes relu reproducibility @@ -265,6 +303,7 @@ rescale resnet restride rewinded +rollout romanized runnable runtime @@ -272,6 +311,8 @@ runtime runtimes scalable softmax +sparsify +specificities src stacktrace stateful @@ -281,6 +322,8 @@ subclasses subclassing subdirectories submodule +submodules +subnetworks subreddit summarization tanh @@ -291,6 +334,7 @@ timesteps tokenization tokenize tokenizer +tooltip topologies torchaudio torchdata @@ -307,7 +351,9 @@ uncommented unfused unimodal unnormalized +unparametrized unpickling +unpruned updation utils vectorization diff --git a/intermediate_source/flask_rest_api_tutorial.py 
b/intermediate_source/flask_rest_api_tutorial.py index 690fa975a5c..0975ff93125 100644 --- a/intermediate_source/flask_rest_api_tutorial.py +++ b/intermediate_source/flask_rest_api_tutorial.py @@ -318,10 +318,10 @@ def get_prediction(image_bytes): # # .. code-block:: python # -# import requests +# import requests # -# resp = requests.post("http://localhost:5000/predict", -# files={"file": open('/cat.jpg','rb')}) +# resp = requests.post("http://localhost:5000/predict", +# files={"file": open('/cat.jpg','rb')}) ####################################################################### # Printing `resp.json()` will now show the following: diff --git a/intermediate_source/reinforcement_ppo.py b/intermediate_source/reinforcement_ppo.py index 8dee73969db..dc6eca94931 100644 --- a/intermediate_source/reinforcement_ppo.py +++ b/intermediate_source/reinforcement_ppo.py @@ -15,7 +15,7 @@ Key learnings: -- How to create an environment in TorchRL, transform its outputs, and collect data from this env; +- How to create an environment in TorchRL, transform its outputs, and collect data from this environment; - How to make your classes talk to each other using :class:`tensordict.TensorDict`; - The basics of building your training loop with TorchRL: @@ -166,7 +166,7 @@ # When using ``frame_skip`` it is good practice to # correct the other frame counts by the number of frames we are grouping # together. If we configure a total count of X frames for training but -# use a ``frame_skip`` of Y, we will be actually collecting XY frames in total +# use a ``frame_skip`` of Y, we will be actually collecting ``XY`` frames in total # which exceeds our predefined budget. # frame_skip = 1 @@ -187,7 +187,7 @@ # The size of these sub-batches is controlled by ``sub_batch_size``. # sub_batch_size = 64 # cardinality of the sub-samples gathered from the current data in the inner loop -num_epochs = 10 # optimisation steps per batch of data collected +num_epochs = 10 # optimization steps per batch of data collected clip_epsilon = ( 0.2 # clip value for PPO loss: see the equation in the intro for more context. ) @@ -201,9 +201,9 @@ # # In RL, an *environment* is usually the way we refer to a simulator or a # control system. Various libraries provide simulation environments for reinforcement -# learning, including Gymnasium (previously OpenAI Gym), DeepMind control suite, and +# learning, including Gymnasium (previously OpenAI Gym), DeepMind Control Suite, and # many others. -# As a generalistic library, TorchRL's goal is to provide an interchangeable interface +# As a general library, TorchRL's goal is to provide an interchangeable interface # to a large panel of RL simulators, allowing you to easily swap one environment # with another. For example, creating a wrapped gym environment can be achieved with few characters: # @@ -214,12 +214,12 @@ # There are a few things to notice in this code: first, we created # the environment by calling the ``GymEnv`` wrapper. If extra keyword arguments # are passed, they will be transmitted to the ``gym.make`` method, hence covering -# the most common env construction commands. +# the most common environment construction commands. # Alternatively, one could also directly create a gym environment using ``gym.make(env_name, **kwargs)`` # and wrap it in a `GymWrapper` class. 
# # Also the ``device`` argument: for gym, this only controls the device where -# input action and observered states will be stored, but the execution will always +# input action and observed states will be stored, but the execution will always # be done on CPU. The reason for this is simply that gym does not support on-device # execution, unless specified otherwise. For other libraries, we have control over # the execution device and, as much as we can, we try to stay consistent in terms of @@ -232,8 +232,8 @@ # the policy. In Gym, this is usually achieved via wrappers. TorchRL takes a different # approach, more similar to other pytorch domain libraries, through the use of transforms. # To add transforms to an environment, one should simply wrap it in a :class:`TransformedEnv` -# instance, and append the sequence of transforms to it. The transformed env will inherit -# the device and meta-data of the wrapped env, and transform these depending on the sequence +# instance, and append the sequence of transforms to it. The transformed environment will inherit +# the device and meta-data of the wrapped environment, and transform these depending on the sequence # of transforms it contains. # # Normalization @@ -255,7 +255,7 @@ # to communicate. You could think of it as a python dictionary with some extra # tensor features. In practice, this means that many modules we will be working # with need to be told what key to read (``in_keys``) and what key to write -# (``out_keys``) in the tensordict they will receive. Usually, if ``out_keys`` +# (``out_keys``) in the ``tensordict`` they will receive. Usually, if ``out_keys`` # is omitted, it is assumed that the ``in_keys`` entries will be updated # in-place. For our transforms, the only entry we are interested in is referred # to as ``"observation"`` and our transform layers will be told to modify this @@ -295,7 +295,7 @@ # environment specs, but you can easily check that your environment specs are # adequate. # In our example, the :class:`GymWrapper` and :class:`GymEnv` that inherits -# from it already take care of setting the proper specs for your env so +# from it already take care of setting the proper specs for your environment so # you should not have to care about this. # # Nevertheless, let's see a concrete example using our transformed @@ -312,7 +312,7 @@ print("action_spec (as defined by input_spec):", env.action_spec) ###################################################################### -# the :func:`check_env_specs` function runs a small rollout and compares its output against the environemnt +# the :func:`check_env_specs` function runs a small rollout and compares its output against the environment # specs. If no error is raised, we can be confident that the specs are properly defined: # check_env_specs(env) @@ -328,7 +328,7 @@ # observation may be composite, meaning that it could be composed of more than one # tensor. This is not a problem for TorchRL, since the whole set of observations # is automatically packed in the output :class:`tensordict.TensorDict`. After executing a rollout -# (ie a sequence of environment steps and random action generations) over a given +# (for example, a sequence of environment steps and random action generations) over a given # number of steps, we will retrieve a :class:`tensordict.TensorDict` instance with a shape # that matches this trajectory length: # @@ -340,7 +340,7 @@ # Our rollout data has a shape of ``torch.Size([3])`, which matches the number of steps # we ran it for. 
The ``"next"`` entry points to the data coming after the current step. # In most cases, the ``"next""`` data at time `t` matches the data at ``t+1``, but this -# may not be the case if we are using some specific transformations (e.g. mutli-step). +# may not be the case if we are using some specific transformations (for example, multi-step). # # Policy # ------ @@ -360,13 +360,13 @@ # f_{\theta}(\text{observation}) = \mu_{\theta}(\text{observation}), \sigma^{+}_{\theta}(\text{observation}) # # The only extra-difficulty that is brought up here is to split our output in two -# equal parts and map the second to a scrictly positive space. +# equal parts and map the second to a strictly positive space. # # We design the policy in three steps: # # 1. Define a neural network ``D_obs`` -> ``2 * D_action``. Indeed, our ``loc`` (mu) and ``scale`` (sigma) both have dimension ``D_action``; # -# 2. Append a :class:`NormalParamExtractor` to extract a location and a scale (ie splits the input in two equal parts +# 2. Append a :class:`NormalParamExtractor` to extract a location and a scale (for example, splits the input in two equal parts # and applies a positive transformation to the scale parameter); # # 3. Create a probabilistic :class:`TensorDictModule` that can create this distribution and sample from it. @@ -384,7 +384,7 @@ ) ###################################################################### -# To enable the policy to "talk" with the environment through the tensordict +# To enable the policy to "talk" with the environment through the ``tensordict`` # data carrier, we wrap the ``nn.Module`` in a :class:`TensorDictModule`. This # class will simply ready the ``in_keys`` it is provided with and write the # outputs in-place at the registered ``out_keys``. @@ -429,7 +429,7 @@ # won't be used at inference time. This module will read the observations and # return an estimation of the discounted return for the following trajectory. # This allows us to amortize learning by relying on the some utility estimation -# that is learnt on-the-fly during training. Our value network share the same +# that is learned on-the-fly during training. Our value network share the same # structure as the policy, but for simplicity we assign it its own set of # parameters. # @@ -484,7 +484,7 @@ # As for the policy and environment before, the data collector will return # :class:`tensordict.TensorDict` instances with a total number of elements that will # match ``frames_per_batch``. Using :class:`tensordict.TensorDict` to pass data to the -# training loop allows you to write dataloading pipelines +# training loop allows you to write data loading pipelines # that are 100% oblivious to the actual specificities of the rollout content. # collector = SyncDataCollector( @@ -525,7 +525,7 @@ # Loss function # ------------- # -# The PPO loss can be directly imported from torchrl for convenience using the +# The PPO loss can be directly imported from TorchRL for convenience using the # :class:`ClipPPOLoss` class. This is the easiest way of utilizing PPO: # it hides away the mathematical operations of PPO and the control flow that # goes with it. @@ -536,7 +536,7 @@ # To compute the advantage, one just needs to (1) build the advantage module, which # utilizes our value operator, and (2) pass each batch of data through it before each # epoch. -# The GAE module will update the input tensordict with new ``"advantage"`` and +# The GAE module will update the input ``tensordict`` with new ``"advantage"`` and # ``"value_target"`` entries. 
# The ``"value_target"`` is a gradient-free tensor that represents the empirical # value that the value network should represent with the input observation. @@ -612,7 +612,7 @@ + loss_vals["loss_entropy"] ) - # Optimization: backward, grad clipping and optim step + # Optimization: backward, grad clipping and optimization step loss_value.backward() # this is not strictly mandatory but it's good practice to keep # your gradient norm bounded @@ -633,8 +633,8 @@ # We evaluate the policy once every 10 batches of data. # Evaluation is rather simple: execute the policy without exploration # (take the expected value of the action distribution) for a given - # number of steps (1000, which is our env horizon). - # The ``rollout`` method of the env can take a policy as argument: + # number of steps (1000, which is our ``env`` horizon). + # The ``rollout`` method of the ``env`` can take a policy as argument: # it will then execute this policy at each step. with set_exploration_mode("mean"), torch.no_grad(): # execute a rollout with the trained policy diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 083ce07f77a..78dc7e2fc6e 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -17,9 +17,9 @@ `Gymnasium's website `__. .. figure:: /_static/img/cartpole.gif - :alt: cartpole + :alt: CartPole - cartpole + CartPole As the agent observes the current state of the environment and chooses an action, the environment *transitions* to a new state, and also @@ -45,7 +45,7 @@ `gymnasium `__ for the environment, installed by using `pip`. This is a fork of the original OpenAI Gym project and maintained by the same team since Gym v0.19. -If you are running this in Google colab, run: +If you are running this in Google Colab, run: .. code-block:: bash @@ -82,7 +82,7 @@ plt.ion() -# if gpu is to be used +# if GPU is to be used device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -96,7 +96,7 @@ # batch are decorrelated. It has been shown that this greatly stabilizes # and improves the DQN training procedure. # -# For this, we're going to need two classses: +# For this, we're going to need two classes: # # - ``Transition`` - a named tuple representing a single transition in # our environment. It essentially maps (state, action) pairs @@ -172,7 +172,7 @@ def __len__(self): # # .. math:: \delta = Q(s, a) - (r + \gamma \max_a' Q(s', a)) # -# To minimise this error, we will use the `Huber +# To minimize this error, we will use the `Huber # loss `__. The Huber loss acts # like the mean squared error when the error is small, but like the mean # absolute error when the error is large - this makes it more robust to @@ -233,7 +233,7 @@ def forward(self, x): # probability of choosing a random action will start at ``EPS_START`` # and will decay exponentially towards ``EPS_END``. ``EPS_DECAY`` # controls the rate of the decay. -# - ``plot_durations`` - a helper for plotting the durations of episodes, +# - ``plot_durations`` - a helper for plotting the duration of episodes, # along with an average over the last 100 episodes (the measure used in # the official evaluations). 
The plot will be underneath the cell # containing the main training loop, and will update after every @@ -246,7 +246,7 @@ def forward(self, x): # EPS_END is the final value of epsilon # EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay # TAU is the update rate of the target network -# LR is the learning rate of the AdamW optimizer +# LR is the learning rate of the ``AdamW`` optimizer BATCH_SIZE = 128 GAMMA = 0.99 EPS_START = 0.9 @@ -391,7 +391,7 @@ def optimize_model(): # # Below, `num_episodes` is set to 600 if a GPU is available, otherwise 50 # episodes are scheduled so training does not take too long. However, 50 -# episodes is insufficient for to observe good performance on cartpole. +# episodes is insufficient for to observe good performance on CartPole. # You should see the model constantly achieve 500 steps within 600 training # episodes. Training RL agents can be a noisy process, so restarting training # can produce better results if convergence is not observed. diff --git a/intermediate_source/scaled_dot_product_attention_tutorial.py b/intermediate_source/scaled_dot_product_attention_tutorial.py index fbc76a15799..669e516f2c2 100644 --- a/intermediate_source/scaled_dot_product_attention_tutorial.py +++ b/intermediate_source/scaled_dot_product_attention_tutorial.py @@ -88,7 +88,7 @@ def benchmark_torch_function_in_microseconds(f, *args, **kwargs): # Lets explore the speed of each of the 3 implementations from torch.backends.cuda import sdp_kernel, SDPBackend -# Helpful arg mapper +# Helpful arguments mapper backend_map = { SDPBackend.MATH: {"enable_math": True, "enable_flash": False, "enable_mem_efficient": False}, SDPBackend.FLASH_ATTENTION: {"enable_math": False, "enable_flash": True, "enable_mem_efficient": False}, @@ -130,8 +130,8 @@ def benchmark_torch_function_in_microseconds(f, *args, **kwargs): # ~~~~~~~~~~~~~~~~~~~~~ # # Below is an example implementation of a multi-headed causal self -# attention block inspired by Andrej Karpathy’s -# `NanoGPT `__ repository. +# attention block inspired by +# `Andrej Karpathy NanoGPT `__ repository. # class CausalSelfAttention(nn.Module): @@ -186,12 +186,12 @@ def forward(self, x): print(model) -###################################################################### -# NestedTensor and Dense tensor support -# ------------------------------------- +##################################################################### +# ``NestedTensor`` and Dense tensor support +# ----------------------------------------- # -# SDPA supports both NestedTensor and Dense tensor inputs. NestedTensors handle the case where the input is a batch of variable length sequences -# without needing to pad each sequence to the maximum length in the batch. For more information about NestedTensors see +# SDPA supports both ``NestedTensor`` and Dense tensor inputs. ``NestedTensors`` handle the case where the input is a batch of variable length sequences +# without needing to pad each sequence to the maximum length in the batch. For more information about ``NestedTensors`` see # `torch.nested `__ and `NestedTensors Tutorial `__. 
# @@ -236,7 +236,7 @@ def generate_rand_batch( random_nt, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=0.5, dtype=dtype, device=device) random_dense, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=None, dtype=dtype, device=device) -# Currently the fused implementations don't support NestedTensor for training +# Currently the fused implementations don't support ``NestedTensor`` for training model.eval() with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]): @@ -248,14 +248,14 @@ def generate_rand_batch( ###################################################################### -# Using SDPA with torch.compile -# ============================ +# Using SDPA with ``torch.compile`` +# ================================= # # With the release of PyTorch 2.0, a new feature called # ``torch.compile()`` has been introduced, which can provide # significant performance improvements over eager mode. # Scaled dot product attention is fully composable with ``torch.compile()``. -# To demonstrate this, let's compile the CausalSelfAttention module using +# To demonstrate this, let's compile the ``CausalSelfAttention`` module using # ``torch.compile()`` and observe the resulting performance improvements. # @@ -303,7 +303,9 @@ def generate_rand_batch( print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) # For even more insights, you can export the trace and use ``chrome://tracing`` to view the results -# prof.export_chrome_trace("compiled_causal_attention_trace.json"). +# :: +# +# prof.export_chrome_trace("compiled_causal_attention_trace.json"). @@ -315,15 +317,14 @@ def generate_rand_batch( # on the same set of functions for both modules. # The reason for this here is that ``torch.compile`` is very good at removing the # framework overhead associated with PyTorch. If your model is launching -# large, efficient CUDA kernels, which in this case CausaulSelfAttention +# large, efficient CUDA kernels, which in this case ``CausaulSelfAttention`` # is, then the overhead of PyTorch can be hidden. # # In reality, your module does not normally consist of a singular -# CausalSelfAttention block. When experimenting with Andrej Karpathy’s -# `NanoGPT `__ repository, compiling +# ``CausalSelfAttention`` block. When experimenting with `Andrej Karpathy NanoGPT `__ repository, compiling # the module took the time per train step from: ``6090.49ms`` to -# ``3273.17ms``! This was done on commit: ae3a8d5 of NanoGPT training on -# the shakespeare dataset. +# ``3273.17ms``! This was done on commit: ``ae3a8d5`` of NanoGPT training on +# the Shakespeare dataset. # @@ -335,7 +336,7 @@ def generate_rand_batch( # ``torch.nn.functional.scaled_dot_product_attention``. We have shown how # the ``sdp_kernel`` context manager can be used to assert a certain # implementation is used on GPU. As well, we built a simple -# CausalSelfAttention module that works with NestedTensor and is torch +# ``CausalSelfAttention`` module that works with ``NestedTensor`` and is torch # compilable. In the process we have shown how to the profiling tools can # be used to explore the performance characteristics of a user defined # module. 
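A minimal, illustrative sketch of the ``sdp_kernel`` and ``scaled_dot_product_attention`` pattern exercised in the hunks above (assumes PyTorch 2.0 with a CUDA device; the tensor shapes below are hypothetical placeholders, not values from the tutorial):

import torch
import torch.nn.functional as F
from torch.backends.cuda import sdp_kernel

# Hypothetical batch size, number of heads, sequence length, and head dimension
query = torch.rand(8, 16, 64, 64, device="cuda", dtype=torch.float16)
key = torch.rand(8, 16, 64, 64, device="cuda", dtype=torch.float16)
value = torch.rand(8, 16, 64, 64, device="cuda", dtype=torch.float16)

# Force the flash attention kernel, mirroring the tutorial's backend_map entries
flash_only = {"enable_math": False, "enable_flash": True, "enable_mem_efficient": False}
with sdp_kernel(**flash_only):
    out = F.scaled_dot_product_attention(query, key, value)
print(out.shape)  # same shape as query: torch.Size([8, 16, 64, 64])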
diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py index e8a5651c57b..853cb2aed45 100644 --- a/intermediate_source/seq2seq_translation_tutorial.py +++ b/intermediate_source/seq2seq_translation_tutorial.py @@ -106,7 +106,7 @@ # yet, someone did the extra work of splitting language pairs into # individual text files here: https://www.manythings.org/anki/ # -# The English to French pairs are too big to include in the repo, so +# The English to French pairs are too big to include in the repository, so # download to ``data/eng-fra.txt`` before continuing. The file is a tab # separated list of translation pairs: # @@ -301,10 +301,10 @@ def prepareData(lang1, lang2, reverse=False): # length and order, which makes it ideal for translation between two # languages. # -# Consider the sentence "Je ne suis pas le chat noir" → "I am not the -# black cat". Most of the words in the input sentence have a direct +# Consider the sentence ``Je ne suis pas le chat noir`` → ``I am not the +# black cat``. Most of the words in the input sentence have a direct # translation in the output sentence, but are in slightly different -# orders, e.g. "chat noir" and "black cat". Because of the "ne/pas" +# orders, e.g. ``chat noir`` and ``black cat``. Because of the ``ne/pas`` # construction there is also one more word in the input sentence. It would # be difficult to produce a correct translation directly from the sequence # of input words. @@ -844,8 +844,8 @@ def evaluateAndShowAttention(input_sentence): # - Chat → Response # - Question → Answer # -# - Replace the embeddings with pre-trained word embeddings such as word2vec or -# GloVe +# - Replace the embeddings with pretrained word embeddings such as ``word2vec`` or +# ``GloVe`` # - Try with more layers, more hidden units, and more sentences. Compare # the training time and results. # - If you use a translation file where pairs have two of the same phrase diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py index 7cd241d40ad..440f2257e1a 100644 --- a/intermediate_source/tensorboard_profiler_tutorial.py +++ b/intermediate_source/tensorboard_profiler_tutorial.py @@ -54,7 +54,7 @@ ###################################################################### # Then prepare the input data. For this tutorial, we use the CIFAR10 dataset. -# Transform it to the desired format and use DataLoader to load each batch. +# Transform it to the desired format and use ``DataLoader`` to load each batch. transform = T.Compose( [T.Resize(224), @@ -116,7 +116,7 @@ def train(data): # - ``profile_memory`` - Track tensor memory allocation/deallocation. Note, for old version of pytorch with version # before 1.10, if you suffer long profiling time, please disable it or upgrade to new version. # - ``with_stack`` - Record source information (file and line number) for the ops. -# If the TensorBoard is launched in VSCode (`reference `_), +# If the TensorBoard is launched in VS Code (`reference `_), # clicking a stack frame will navigate to the specific code line. with torch.profiler.profile( @@ -217,13 +217,13 @@ def train(data): # The "Total" duration includes its child operators’ time. # # - View call stack -# Click the "View Callstack" of an operator, the operators with same name but different call stacks will be shown. -# Then click a "View Callstack" in this sub-table, the call stack frames will be shown. 
+# Click the ``View Callstack`` of an operator, and the operators with the same name but different call stacks will be shown.
+# Then click a ``View Callstack`` in this sub-table, and the call stack frames will be shown.
 #
 # .. image:: ../../_static/img/profiler_callstack.png
 #    :scale: 25 %
 #
-# If the TensorBoard is launched inside VSCode
+# If the TensorBoard is launched inside VS Code
 # (`Launch Guide `_),
 # clicking a call stack frame will navigate to the specific code line.
 #
@@ -279,8 +279,8 @@ def train(data):
 # 5. Improve performance with the help of profiler
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
-# At the bottom of "Overview" page, the suggestion in "Performance Recommendation" hints the bottleneck is DataLoader.
-# The PyTorch DataLoader uses single process by default.
+# At the bottom of the "Overview" page, the suggestion in "Performance Recommendation" hints that the bottleneck is the ``DataLoader``.
+# The PyTorch ``DataLoader`` uses a single process by default.
 # User could enable multi-process data loading by setting the parameter ``num_workers``.
 # `Here `_ is more details.
 #
@@ -350,7 +350,7 @@ def train(data):
 # In the memory events table, the allocation and release events are paired into one entry. The "operator" column shows
 # the immediate ATen operator that is causing the allocation. Notice that in PyTorch, ATen operators commonly use
 # ``aten::empty`` to allocate memory. For example, ``aten::ones`` is implemented as ``aten::empty`` followed by an
-# ``aten::fill_``. Solely display the opeartor name as ``aten::empty`` is of little help. It will be shown as
+# ``aten::fill_``. Solely displaying the operator name as ``aten::empty`` is of little help. It will be shown as
 # ``aten::ones (aten::empty)`` in this special case. The "Allocation Time", "Release Time" and "Duration"
 # columns' data might be missing if the event occurs outside of the time range.
 #
diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
index ad1c5d41be9..fcea4ed6611 100644
--- a/intermediate_source/torch_compile_tutorial.py
+++ b/intermediate_source/torch_compile_tutorial.py
@@ -41,7 +41,7 @@
 # Basic Usage
 # ------------
 #
-# ``torch.compile`` is included in the latest PyTorch nightlies.
+# ``torch.compile`` is included in the latest PyTorch.
 # Running TorchInductor on GPU requires Triton, which is included with the PyTorch 2.0 nightly
 # binary. If Triton is still missing, try installing ``torchtriton`` via pip
 # (``pip install torchtriton --extra-index-url "https://download.pytorch.org/whl/nightly/cu117"``
@@ -125,7 +125,7 @@ def init_model():
 # First, let's compare inference.
 #
 # Note that in the call to ``torch.compile``, we have have the additional
-# ``mode`` kwarg, which we will discuss below.
+# ``mode`` argument, which we will discuss below.
 
 def evaluate(mod, inp):
     return mod(inp)
@@ -184,7 +184,7 @@ def evaluate(mod, inp):
 # GPU compute and the observed speedup may be less significant.
 #
 # You may also see different speedup results depending on the chosen ``mode``
-# kwarg. Since our model and data are small, we want to reduce overhead as
+# argument. Since our model and data are small, we want to reduce overhead as
 # much as possible, and so we chose ``"reduce-overhead"``. For your own models,
 # you may need to experiment with different modes to maximize speedup. You can
 # read more about modes `here `__. 
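Since the hunks above standardize on calling ``mode`` an argument rather than a kwarg, a short sketch of how that argument is actually passed may help. The toy model below is illustrative only, and the comments paraphrase the tutorial's pointers rather than report benchmark results:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))

# Default mode: a balance between compile time and runtime speedup.
compiled_default = torch.compile(model)
# "reduce-overhead": targets small models and batches where framework overhead
# dominates, trading some extra memory for lower per-call overhead.
compiled_fast = torch.compile(model, mode="reduce-overhead")
# "max-autotune": spends longer compiling while searching for faster kernels.
compiled_tuned = torch.compile(model, mode="max-autotune")

x = torch.randn(32, 64)
out = compiled_fast(x)  # the first call triggers compilation; later calls reuse it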
From 836a537b2a7eae38b0c272772838ed42ce99744e Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 15:30:40 -0700 Subject: [PATCH 4/7] Update --- .pyspelling.yml | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index bce94f383cc..ad1c666d89d 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -4,34 +4,6 @@ matrix: sources: - beginner_source/*.py - intermediate_source/*.py - #- intermediate_source/autograd_saved_tensors_hooks_tutorial.py - #- intermediate_source/ax_multiobjective_nas_tutorial.py - #- intermediate_source/char_rnn_classification_tutorial.py - #- intermediate_source/char_rnn_generation_tutorial.py - #- intermediate_source/custom_function_conv_bn_tutorial.py - #- intermediate_source/ensembling.py - #- intermediate_source/flask_rest_api_tutorial.py - #- intermediate_source/forward_ad_usage.py - #- intermediate_source/fx_conv_bn_fuser.py - #- intermediate_source/fx_profiling_tutorial.py - #- intermediate_source/jacobians_hessians.py - #- intermediate_source/mario_rl_tutorial.py - #- intermediate_source/mnist_train_nas.py - #- intermediate_source/memory_format_tutorial.py - #- intermediate_source/model_parallel_tutorial.py - #- intermediate_source/neural_tangent_kernels.py - #- intermediate_source/nvfuser_intro_tutorial.py - #- intermediate_source/parametrizations.py - #- intermediate_source/per_sample_grads.py - #- intermediate_source/pipeline_tutorial.py - #- intermediate_source/pruning_tutorial.py - #- intermediate_source/reinforcement_ppo.py - #- intermediate_source/reinforcement_q_learning.py - #- intermediate_source/scaled_dot_product_attention_tutorial.py - #- intermediate_source/seq2seq_translation_tutorial.py - #- intermediate_source/spatial_transformer_tutorial.py - #- intermediate_source/tensorboard_profiler_tutorial.py - #- intermediate_source/torch_compile_tutorial.py dictionary: wordlists: - en-wordlist.txt From 37b17f5c04166adbbb4a5bb17cd31a7a107a4fd0 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 15:31:48 -0700 Subject: [PATCH 5/7] Update --- .pyspelling.yml | 2 -- en-wordlist.txt | 10 +++++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index ad1c666d89d..ffe9f469d03 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -24,8 +24,6 @@ matrix: - open: '\.\. 
(raw)::.*$\n*' close: '\n' # Exclude - - open: '.*(:py:mod:).*' - close: ' ' # Exclude Python coding directives - open: '-\*- coding:' close: '\n' diff --git a/en-wordlist.txt b/en-wordlist.txt index d8ddb614157..025098fd7ee 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,8 +1,3 @@ -UI -bytecode -TorchInductor -unoptimized -TorchDynamo APIs ATen Args @@ -125,12 +120,15 @@ TPU TensorBoard TextVQA Tokenization +TorchDynamo +TorchInductor TorchMultimodal TorchRL TorchRL's TorchScript TorchX Tunable +UI Unescape VQA VS Code @@ -160,6 +158,7 @@ batchnorm's benchmarking boolean broadcasted +bytecode cardinality chatbot chatbot's @@ -351,6 +350,7 @@ uncommented unfused unimodal unnormalized +unoptimized unparametrized unpickling unpruned From a2d6d2650d368b9b8700801390937dc397633a0e Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 15:35:05 -0700 Subject: [PATCH 6/7] Update --- 1 | 341 -------------------------------------------------------------- 1 file changed, 341 deletions(-) delete mode 100644 1 diff --git a/1 b/1 deleted file mode 100644 index 996a909cd88..00000000000 --- a/1 +++ /dev/null @@ -1,341 +0,0 @@ -Andrej -Karpathy's -NanoGPT -compilable -decorrelated -DQN -deterministically -approximators -duration -CartPole -EPS -APIs -Args -Autograd -BCE -BN -BOS -Bahdanau -BatchNorm -CHW -CIFAR -CLS -CNNDM -CNNs -CPUs -CUDA -Cayley -Chatbots -Colab -Conv -ConvNet -ConvNets -DCGAN -DCGANs -DDQN -DNN -DataLoaders -DeepMind -DeiT -DenseNet -EOS -FC -FGSM -FLAVA -FX -FX's -FloydHub -FloydHub's -Frobenius -GAE -GAN -GANs -GPU's -GPUs -GRU -GRUs -GTC -GeForce -Goodfellow -Goodfellow’s -GreedySearchDecoder -HVP -Hugging Face -IMDB -ImageNet -Initializations -Iteratively -JSON -JVP -Jacobian -Kiuk -Kubernetes -Kuei -LSTM -LSTMs -LeCun -LeNet -LeakyReLU -LeakyReLUs -Lipschitz -Lua -Luong -MLP -MLPs -MNIST -Mypy -NAS -NCHW -NES -NLP -NTK -NaN -NeurIPS -NumPy -Numericalization -Numpy's -OpenAI -PPO -Plotly -Prec -Profiler -PyTorch's -RGB -RL -RNN -RNNs -RPC -RTX -Radford -ReLU -ResNet -SPD -SST2 -Sequentials -Sigmoid -SoTA -TPU -TensorBoard -TextVQA -Tokenization -TorchMultimodal -TorchRL -TorchRL's -TorchScript -TorchX -Tunable -Unescape -VQA -Wikitext -Xeon -accuracies -activations -adversarially -al -autodiff -autograd -backend -backends -backprop -backpropagate -backpropagated -backpropagates -backpropagation -batchnorm -batchnorm's -benchmarking -boolean -broadcasted -cardinality -chatbot -chatbot's -checkpointing -composable -concat -config -contrastive -conv -convolutional -cpu -csv -cuDNN -datafile -dataframe -dataloader -dataloaders -datapipes -dataset -datasets -dataset’s -deserialize -deserialized -dimensionality -dir -downsample -downsamples -embeddings -encodings -ensembling -eq -et -evaluateInput -extensibility -fastai -fbgemm -feedforward -finetune -finetuning -fp -functorch -fuser -geomean -grayscale -hardcode -helpdesk -helpdesks -hessian -hessians -hvp -hyperparameter -hyperparameters -imagenet -initializations -inlined -interpretable -io -iterable -iteratively -jacobian -jacobians -jit -jpg -kwargs -labelled -learnable -learnings -loadFilename -manualSeed -matplotlib -minibatch -minibatches -minimax -misclassification -misclassified -modularity -modularized -multihead -multimodal -multimodality -multiobjective -multiprocessed -multithreaded -namespace -natively -ndarrays -num -numericalize -numpy -nvFuser -nvFuser's -optimizable -optimizer's -optimizers -overfitting -parallelizable -parallelization -parametrization -parametrizations -parametrized 
-parametrizing -perceptibility -pipelining -pointwise -precompute -precomputing -prepend -preprocess -preprocessing -prepruned -prespecified -pretrained -prewritten -primals -profiler -profilers -pytorch -quantized -quantizing -queryable -randint -readably -recomputation -reimplement -reimplementing -reimplements -reinitializes -relu -reproducibility -rescale -resnet -restride -rewinded -rollout -romanized -runnable -runtime -runtime -runtimes -scalable -softmax -sparsify -specificities -src -stacktrace -stateful -storages -strided -subclasses -subclassing -subdirectories -submodule -submodules -subnetworks -subreddit -summarization -tanh -th -thresholding -timestep -timesteps -tokenization -tokenize -tokenizer -topologies -torchaudio -torchdata -torchscriptable -torchtext -torchtext's -torchvision -torchviz -traceback -tradeoff -tradeoffs -uncomment -uncommented -unfused -unimodal -unnormalized -unparametrized -unpickling -unpruned -updation -utils -vectorization -vectorize -vectorized -vhp -voc -walkthrough -warmstart -warmstarting From 16d01dc2505caeaba60c76fefa1e7d347c7ef2c6 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 16:15:01 -0700 Subject: [PATCH 7/7] Fix --- intermediate_source/pipeline_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py index 444eddf8415..33561f60592 100644 --- a/intermediate_source/pipeline_tutorial.py +++ b/intermediate_source/pipeline_tutorial.py @@ -174,7 +174,7 @@ def data_process(raw_text_iter): def batchify(data, bsz): # Divide the dataset into ``bsz`` parts. - nbatch = data.size(0) // ``bsz`` + nbatch = data.size(0) // bsz # Trim off any extra elements that wouldn't cleanly fit (remainders). data = data.narrow(0, 0, nbatch * bsz) # Evenly divide the data across the ``bsz` batches.
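For completeness, here is a self-contained sketch of the ``batchify`` helper that the final hunk repairs, exercised on a tiny illustrative input. The trailing reshape follows the standard language-modeling ``batchify`` pattern and is an assumption here, since the hunk ends before the function's return; the toy tensor is not from the tutorial:

import torch

def batchify(data, bsz):
    # Divide the dataset into ``bsz`` parts.
    nbatch = data.size(0) // bsz  # plain integer division, as fixed in the hunk above
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the ``bsz`` batches.
    return data.view(bsz, -1).t().contiguous()

tokens = torch.arange(26)          # toy stream of 26 token ids
batches = batchify(tokens, bsz=4)  # shape (6, 4); the last 2 tokens are trimmed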