 # networks, so improving the speed of these operations can improve
 # overall network training speed. Future releases of nvFuser will
 # improve the performance of Linear Layers, but for now we will
-# specifically look at the Bias-Dropout-Add-LayerNorm section of this
+# specifically look at the ``Bias-Dropout-Add-LayerNorm`` section of this
 # Transformer Block.
 #
 # .. figure:: /_static/img/nvfuser_intro/nvfuser_transformer_block.png
@@ -154,7 +154,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""):
         # Run model, forward and backward
         output = forward_func()
         output.backward(grad_output)
-        # delete gradiens to avoid profiling the gradient accumulation
+        # delete gradients to avoid profiling the gradient accumulation
         for p in parameters:
             p.grad = None
 
@@ -165,7 +165,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""):
         # Run model, forward and backward
         output = forward_func()
         output.backward(grad_output)
-        # delete gradiens to avoid profiling the gradient accumulation
+        # delete gradients to avoid profiling the gradient accumulation
         for p in parameters:
             p.grad = None
 
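
For context, the two hunks above touch the warm-up and timed loops of the tutorial's ``profile_workload`` helper. A minimal sketch of such a helper is shown below; the loop structure and the module-level ``parameters`` list are assumptions based on the surrounding diff, not the tutorial's exact code::

    import time

    import torch

    # Assumption: ``parameters`` holds the tensors that accumulate gradients,
    # e.g. ``parameters = list(model.parameters())`` earlier in the tutorial.
    parameters = []


    def profile_workload(forward_func, grad_output, iteration_count=100, label=""):
        # Warm-up iterations so one-time compilation cost is not measured
        for _ in range(3):
            # Run model, forward and backward
            output = forward_func()
            output.backward(grad_output)
            # delete gradients to avoid profiling the gradient accumulation
            for p in parameters:
                p.grad = None

        # Timed iterations (a CUDA device is assumed, as in the tutorial)
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iteration_count):
            # Run model, forward and backward
            output = forward_func()
            output.backward(grad_output)
            # delete gradients to avoid profiling the gradient accumulation
            for p in parameters:
                p.grad = None
        torch.cuda.synchronize()
        stop = time.perf_counter()
        print(f"{label}: {iteration_count / (stop - start):.1f} iterations per second")
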
@@ -265,7 +265,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""):
 # nvFuser took around 2.4s in total to compile these high speed
 # GPU functions.
 #
-# nvFuser’s capabilities extend well beyond this initial performance gain.
+# nvFuser's capabilities extend well beyond this initial performance gain.
 #
 
 ######################################################################
@@ -281,7 +281,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""):
 # To use nvFuser on inputs that change shape from iteration, we
 # generate new input and output gradient tensors and make a few
 # different sizes. Since the last dimension is shared with the
-# parameters and cannot be changed dynamically in LayerNorm, we
+# parameters and cannot be changed dynamically in ``LayerNorm``, we
 # perturb the first two dimensions of the input and gradient tensors.
 #
 
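
The hunk above describes the dynamic-shape experiment, but the tensor sizes themselves are not part of this diff. A sketch of how such inputs might be generated, with illustrative sizes and the last (normalized) dimension held fixed::

    import torch

    # Illustrative values only; the tutorial's actual sizes are not shown in this diff.
    hidden_size = 1024
    device = "cuda"
    dtype = torch.float16

    # Perturb the first two dimensions; the last dimension stays fixed because it
    # is tied to the LayerNorm parameters.
    shapes = [(128, 64, hidden_size), (120, 56, hidden_size), (136, 72, hidden_size)]

    inputs = [torch.randn(s, device=device, dtype=dtype, requires_grad=True) for s in shapes]
    grad_outputs = [torch.randn(s, device=device, dtype=dtype) for s in shapes]
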
@@ -390,16 +390,16 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""):
 #
 
 ######################################################################
-# Defining novel operations with nvFuser and FuncTorch
+# Defining novel operations with nvFuser and functorch
 # ----------------------------------------------------
 #
 # One of the primary benefits of nvFuser is the ability to define
 # novel operations composed of PyTorch “primitives” which are then
 # just-in-time compiled into efficient kernels.
 #
 # PyTorch has strong performance for any individual operation,
-# especially composite operations like LayerNorm. However, if
-# LayerNorm wasn’t already implemented in PyTorch as a composite
+# especially composite operations like ``LayerNorm``. However, if
+# ``LayerNorm`` wasn’t already implemented in PyTorch as a composite
 # operation, then you’d have to define it as a series of simpler
 # (primitive) operations. Let’s make such a definition and run it
 # without nvFuser.
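
The tutorial's actual ``primitive_definition`` (referenced in the hunk headers below) also covers the bias, dropout, and residual add; as an illustration of the idea only, a LayerNorm built from primitive ops could look roughly like this (hypothetical helper, not the tutorial's code)::

    import torch


    def layer_norm_from_primitives(
        x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float = 1e-5
    ) -> torch.Tensor:
        # Normalize over the last dimension using only primitive operations
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, unbiased=False, keepdim=True)
        normalized = (x - mean) / torch.sqrt(var + eps)
        # Affine transform, matching what torch.nn.functional.layer_norm applies
        return normalized * weight + bias
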
@@ -488,7 +488,7 @@ def primitive_definition(
 #
 # However, the performance is still slower than the original eager
 # mode performance of the composite definition. TorchScript works well
-# when predefined composite operations are used, however TorchScript’s
+# when predefined composite operations are used, however TorchScript's
 # application of Autograd saves all of the activations for each
 # operator in the fusion for re-use in the backwards pass. However,
 # this is not typically the optimal choice. Especially when chaining
@@ -499,7 +499,7 @@ def primitive_definition(
 # It’s possible to optimize away many of these unnecessary memory
 # accesses, but it requires building a connected forward and backward
 # graph which isn’t possible with TorchScript. The
-# `memory_efficient_fusion` pass in FuncTorch, however, is such an
+# ``memory_efficient_fusion`` pass in functorch, however, is such an
 # optimization pass. To use this pass, we have to redefine our
 # function to pull the constants inside (for now it’s easiest to make
 # non-tensor constants literals in the function definition):
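
The body of ``primitive_definition_for_memory_efficient_fusion`` is not shown in this diff; a sketch of what "pulling the constants inside" might look like, with the dropout probability and eps written as literals (the signature and values here are assumptions)::

    import torch


    def primitive_definition_for_memory_efficient_fusion(
        input1: torch.Tensor,
        input2: torch.Tensor,
        weight: torch.Tensor,
        bias1: torch.Tensor,
        bias2: torch.Tensor,
    ) -> torch.Tensor:
        # Bias-Dropout-Add with the dropout probability as a literal
        bias_out = input1 + bias1
        dropout_out = torch.nn.functional.dropout(bias_out, 0.1)
        residual = dropout_out + input2
        # LayerNorm from primitives with eps as a literal
        mean = residual.mean(dim=-1, keepdim=True)
        var = residual.var(dim=-1, unbiased=False, keepdim=True)
        norm_output = (residual - mean) / torch.sqrt(var + 1e-5)
        return norm_output * weight + bias2
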
@@ -527,11 +527,11 @@ def primitive_definition_for_memory_efficient_fusion(
 
 ######################################################################
 # Now, instead of passing our function to TorchScript, we will pass it
-# to FuncTorch’s optimization pass.
+# to the functorch optimization pass.
 #
 
 
-# Optimize the model with FuncTorch tracing and the memory efficiency
+# Optimize the model with functorch tracing and the memory efficiency
 # optimization pass
 memory_efficient_primitive_definition = memory_efficient_fusion(
     primitive_definition_for_memory_efficient_fusion
@@ -550,22 +550,22 @@ def primitive_definition_for_memory_efficient_fusion(
 
 ######################################################################
 # This recovers even more speed, but it’s still not as fast as
-# TorchScripts original performance with the composite definition.
+# TorchScript's original performance with the composite definition.
 # However, this is still faster than running this new definition
 # without nvFuser, and is still faster than the composite definition
 # without nvFuser.
 #
 # .. figure:: /_static/img/nvfuser_intro/nvfuser_tutorial_5.png
 #
-# .. note:: FuncTorch’s memory efficient pass is experimental and still
+# .. note:: The functorch memory efficient pass is experimental and still
 #    actively in development.
 #    Future versions of the API are expected to achieve performance
 #    closer to that of TorchScript with the composite definition.
 #
-# .. note:: FuncTorch’s memory efficient pass specializes on the shapes of
+# .. note:: The functorch memory efficient pass specializes on the shapes of
 #    the inputs to the function. If new inputs are provided with
 #    different shapes, then you need to construct a new function
-#    using `memory_efficient_fusion` and apply it to the new inputs.
+#    using ``memory_efficient_fusion`` and apply it to the new inputs.
 
 
 ######################################################################
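
To illustrate the second note above: because the pass specializes on input shapes, a new fused function has to be built when the shapes change. A self-contained sketch of that workflow, using a toy function in place of the tutorial's definition (``fused_bias_gelu`` and the sizes are made up; a CUDA device is assumed)::

    import torch
    from functorch.compile import memory_efficient_fusion


    def fused_bias_gelu(x: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
        # Stand-in for the tutorial's primitive definition
        return torch.nn.functional.gelu(x + bias)


    bias = torch.randn(1024, device="cuda", requires_grad=True)

    # The fused function specializes on the shapes seen at first use...
    fused = memory_efficient_fusion(fused_bias_gelu)
    out_a = fused(torch.randn(128, 64, 1024, device="cuda", requires_grad=True), bias)

    # ...so inputs with different shapes need a freshly constructed function.
    fused_new_shape = memory_efficient_fusion(fused_bias_gelu)
    out_b = fused_new_shape(torch.randn(96, 48, 1024, device="cuda", requires_grad=True), bias)
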
@@ -577,10 +577,10 @@ def primitive_definition_for_memory_efficient_fusion(
 # an entirely new operation in PyTorch – which takes a lot of time and
 # knowledge of the lower-level PyTorch code as well as parallel
 # programming – or writing the operation in simpler PyTorch ops and
-# settling for poor performance. For example, let's replace LayerNorm
-# in our example with RMSNorm. Even though RMSNorm is a bit simpler
-# than LayerNorm, it doesn’t have an existing compound operation in
-# PyTorch. See the `Root Mean Square Layer Normalization <https://doi.org/10.48550/arXiv.1910.07467>`__ paper for more information about RMSNorm.
+# settling for poor performance. For example, let's replace ``LayerNorm``
+# in our example with ``RMSNorm``. Even though ``RMSNorm`` is a bit simpler
+# than ``LayerNorm``, it doesn’t have an existing compound operation in
+# PyTorch. See the `Root Mean Square Layer Normalization <https://doi.org/10.48550/arXiv.1910.07467>`__ paper for more information about ``RMSNorm``.
 # As before, we’ll define our new transformer block with
 # primitive PyTorch operations.
 #
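
The tutorial's ``with_rms_norm`` block is defined outside this diff; for reference, the RMSNorm part written in primitive ops could look roughly like this (a sketch based on the cited paper, not the tutorial's exact code)::

    import torch


    def rms_norm_from_primitives(
        x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
    ) -> torch.Tensor:
        # RMSNorm rescales by the root mean square of the last dimension;
        # unlike LayerNorm there is no mean subtraction and no bias term.
        rms = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
        return x / rms * weight
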
@@ -608,7 +608,7 @@ def with_rms_norm(
 # As before, we’ll get a baseline by running PyTorch without nvFuser.
 #
 
-# Profile rms_norm
+# Profile ``rms_norm``
 func = functools.partial(
     with_rms_norm,
     input1,
@@ -625,7 +625,7 @@ def with_rms_norm(
 # With nvFuser through TorchScript.
 #
 
-# Profile scripted rms_norm
+# Profile scripted ``rms_norm``
 scripted_with_rms_norm = torch.jit.script(with_rms_norm)
 func = functools.partial(
     scripted_with_rms_norm,
@@ -656,7 +656,7 @@ def with_rms_norm_for_memory_efficient_fusion(
     return norm_output
 
 
-# Profile memory efficient rms_norm
+# Profile memory efficient ``rms_norm``
 memory_efficient_rms_norm = memory_efficient_fusion(
     with_rms_norm_for_memory_efficient_fusion
 )
@@ -666,12 +666,12 @@ def with_rms_norm_for_memory_efficient_fusion(
 ######################################################################
 # .. figure:: /_static/img/nvfuser_intro/nvfuser_tutorial_6.png
 #
-# Since RMSNorm is simpler than LayerNorm the performance of our new
+# Since ``RMSNorm`` is simpler than ``LayerNorm`` the performance of our new
 # transformer block is a little higher than the primitive definition
 # without nvFuser (354 iterations per second compared with 260
 # iterations per second). With TorchScript, the iterations per second
 # increases by 2.68x and 3.36x to 952 iterations per second and 1,191
-# iterations per second with TorchScript and FuncTorch’s memory
+# iterations per second with TorchScript and the functorch memory
 # efficient optimization pass, respectively. The performance of this
 # new operation nearly matches the performance of the composite Layer
 # Norm definition with TorchScript.