From 3aaa3d24a6960e81d1ef0e492bca263cb7a3d1df Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 18 Apr 2023 15:54:01 -0700 Subject: [PATCH 1/7] Pyspelling: intermediate Python tutorials N-Z --- .pyspelling.yml | 37 ++++++++------ en-wordlist.txt | 25 ++++++++++ intermediate_source/neural_tangent_kernels.py | 12 ++--- intermediate_source/nvfuser_intro_tutorial.py | 50 +++++++++---------- intermediate_source/parametrizations.py | 10 ++-- intermediate_source/per_sample_grads.py | 20 ++++---- intermediate_source/pipeline_tutorial.py | 14 +++--- 7 files changed, 99 insertions(+), 69 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index 9dce7c8215a..598ce7698df 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -2,22 +2,27 @@ spellchecker: aspell matrix: - name: python sources: - - beginner_source/*.py - - intermediate_source/autograd_saved_tensors_hooks_tutorial.py - - intermediate_source/ax_multiobjective_nas_tutorial.py - - intermediate_source/char_rnn_classification_tutorial.py - - intermediate_source/char_rnn_generation_tutorial.py - - intermediate_source/custom_function_conv_bn_tutorial.py - - intermediate_source/ensembling.py + #- beginner_source/*.py + #- intermediate_source/autograd_saved_tensors_hooks_tutorial.py + #- intermediate_source/ax_multiobjective_nas_tutorial.py + #- intermediate_source/char_rnn_classification_tutorial.py + #- intermediate_source/char_rnn_generation_tutorial.py + #- intermediate_source/custom_function_conv_bn_tutorial.py + #- intermediate_source/ensembling.py #- intermediate_source/flask_rest_api_tutorial.py - - intermediate_source/forward_ad_usage.py - - intermediate_source/fx_conv_bn_fuser.py - - intermediate_source/fx_profiling_tutorial.py - - intermediate_source/jacobians_hessians.py - - intermediate_source/mario_rl_tutorial.py - - intermediate_source/mnist_train_nas.py - - intermediate_source/memory_format_tutorial.py - - intermediate_source/model_parallel_tutorial.py + #- intermediate_source/forward_ad_usage.py + #- intermediate_source/fx_conv_bn_fuser.py + #- intermediate_source/fx_profiling_tutorial.py + #- intermediate_source/jacobians_hessians.py + #- intermediate_source/mario_rl_tutorial.py + #- intermediate_source/mnist_train_nas.py + #- intermediate_source/memory_format_tutorial.py + #- intermediate_source/model_parallel_tutorial.py + #- intermediate_source/neural_tangent_kernels.py + #- intermediate_source/nvfuser_intro_tutorial.py + #- intermediate_source/parametrizations.py + #- intermediate_source/per_sample_grads.py + - intermediate_source/pipeline_tutorial.py dictionary: wordlists: - en-wordlist.txt @@ -46,7 +51,7 @@ matrix: - open: '(?s)^::\n\n ' close: '^\n' # Ignore reStructuredText block directives - - open: '\.\. (code-block)::.*$\n*' + - open: '\.\. 
(code-block|math)::.*$\n*' content: '(?P(^(?P[ ]+).*$\n))(?P(^([ \t]+.*|[ \t]*)$\n)*)' close: '(^(?![ \t]+.*$))' - pyspelling.filters.markdown: diff --git a/en-wordlist.txt b/en-wordlist.txt index 9a4a99d1df2..2c9f5e270c8 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,3 +1,19 @@ +RPC +multihead +GPU's +Lipschitz +Frobenius +reimplement +reimplements +reimplementing +parametrizing +unparametrized +submodules +SPD +Cayley +parametrization +parametrized +parametrizations APIs Args Autograd @@ -38,6 +54,7 @@ GANs GPUs GRU GRUs +GTC GeForce Goodfellow Goodfellow’s @@ -69,6 +86,7 @@ NAS NCHW NES NLP +NTK NaN NeurIPS NumPy @@ -161,6 +179,7 @@ finetuning fp functorch fuser +geomean grayscale hardcode helpdesk @@ -204,6 +223,8 @@ ndarrays num numericalize numpy +nvFuser +nvFuser's optimizable optimizer's optimizers @@ -213,6 +234,7 @@ parallelization perceptibility pipelining pointwise +precompute precomputing prepend preprocess @@ -229,6 +251,7 @@ quantizing queryable randint readably +recomputation reinitializes relu reproducibility @@ -262,6 +285,7 @@ timesteps tokenization tokenize tokenizer +topologies torchaudio torchdata torchscriptable @@ -278,6 +302,7 @@ unfused unimodal unnormalized unpickling +updation utils vectorization vectorize diff --git a/intermediate_source/neural_tangent_kernels.py b/intermediate_source/neural_tangent_kernels.py index 5d897bfa31f..ca1de89daf1 100644 --- a/intermediate_source/neural_tangent_kernels.py +++ b/intermediate_source/neural_tangent_kernels.py @@ -58,7 +58,7 @@ def forward(self, x): # we will need a function that accepts the parameters of the model and a single # input (as opposed to a batch of inputs!) and returns a single output. # -# We'll use ``torch.func.functional_call``, which allows us to call an nn.Module +# We'll use ``torch.func.functional_call``, which allows us to call an ``nn.Module`` # using different parameters/buffers, to help accomplish the first step. # # Keep in mind that the model was originally written to accept a batch of input @@ -200,10 +200,10 @@ def func_x2(params): output, vjp_fn = vjp(func_x1, params) def get_ntk_slice(vec): - # This computes vec @ J(x2).T + # This computes ``vec @ J(x2).T`` # `vec` is some unit vector (a single slice of the Identity matrix) vjps = vjp_fn(vec) - # This computes J(X1) @ vjps + # This computes ``J(X1) @ vjps`` _, jvps = jvp(func_x2, (params,), vjps) return jvps @@ -211,10 +211,10 @@ def get_ntk_slice(vec): basis = torch.eye(output.numel(), dtype=output.dtype, device=output.device).view(output.numel(), -1) return vmap(get_ntk_slice)(basis) - # get_ntk(x1, x2) computes the NTK for a single data point x1, x2 - # Since the x1, x2 inputs to empirical_ntk_ntk_vps are batched, + # ``get_ntk(x1, x2)`` computes the NTK for a single data point x1, x2 + # Since the x1, x2 inputs to ``empirical_ntk_ntk_vps`` are batched, # we actually wish to compute the NTK between every pair of data points - # between {x1} and {x2}. That's what the vmaps here do. + # between {x1} and {x2}. That's what the ``vmaps`` here do. result = vmap(vmap(get_ntk, (None, 0)), (0, None))(x1, x2) if compute == 'full': diff --git a/intermediate_source/nvfuser_intro_tutorial.py b/intermediate_source/nvfuser_intro_tutorial.py index 91166fcce1e..155c1471a72 100644 --- a/intermediate_source/nvfuser_intro_tutorial.py +++ b/intermediate_source/nvfuser_intro_tutorial.py @@ -71,7 +71,7 @@ # networks, so improving the speed of these operations can improve # overall network training speed. 
Future releases of nvFuser will # improve the performance of Linear Layers, but for now we will -# specifically look at the Bias-Dropout-Add-LayerNorm section of this +# specifically look at the ``Bias-Dropout-Add-LayerNorm`` section of this # Transformer Block. # # .. figure:: /_static/img/nvfuser_intro/nvfuser_transformer_block.png @@ -154,7 +154,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # Run model, forward and backward output = forward_func() output.backward(grad_output) - # delete gradiens to avoid profiling the gradient accumulation + # delete gradients to avoid profiling the gradient accumulation for p in parameters: p.grad = None @@ -165,7 +165,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # Run model, forward and backward output = forward_func() output.backward(grad_output) - # delete gradiens to avoid profiling the gradient accumulation + # delete gradients to avoid profiling the gradient accumulation for p in parameters: p.grad = None @@ -265,7 +265,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # nvFuser took around 2.4s in total to compile these high speed # GPU functions. # -# nvFuser’s capabilities extend well beyond this initial performance gain. +# nvFuser's capabilities extend well beyond this initial performance gain. # ###################################################################### @@ -281,7 +281,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # To use nvFuser on inputs that change shape from iteration, we # generate new input and output gradient tensors and make a few # different sizes. Since the last dimension is shared with the -# parameters and cannot be changed dynamically in LayerNorm, we +# parameters and cannot be changed dynamically in ``LayerNorm``, we # perturb the first two dimensions of the input and gradient tensors. # @@ -390,7 +390,7 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # ###################################################################### -# Defining novel operations with nvFuser and FuncTorch +# Defining novel operations with nvFuser and functorch # ---------------------------------------------------- # # One of the primary benefits of nvFuser is the ability to define @@ -398,8 +398,8 @@ def profile_workload(forward_func, grad_output, iteration_count=100, label=""): # just-in-time compiled into efficient kernels. # # PyTorch has strong performance for any individual operation, -# especially composite operations like LayerNorm. However, if -# LayerNorm wasn’t already implemented in PyTorch as a composite +# especially composite operations like ``LayerNorm``. However, if +# ``LayerNorm`` wasn’t already implemented in PyTorch as a composite # operation, then you’d have to define it as a series of simpler # (primitive) operations. Let’s make such a definition and run it # without nvFuser. @@ -488,7 +488,7 @@ def primitive_definition( # # However, the performance is still slower than the original eager # mode performance of the composite definition. TorchScript works well -# when predefined composite operations are used, however TorchScript’s +# when predefined composite operations are used, however TorchScript # application of Autograd saves all of the activations for each # operator in the fusion for re-use in the backwards pass. However, # this is not typically the optimal choice. 
Especially when chaining @@ -499,7 +499,7 @@ def primitive_definition( # It’s possible to optimize away many of these unnecessary memory # accesses, but it requires building a connected forward and backward # graph which isn’t possible with TorchScript. The -# `memory_efficient_fusion` pass in FuncTorch, however, is such an +# ``memory_efficient_fusion`` pass in functorch, however, is such an # optimization pass. To use this pass, we have to redefine our # function to pull the constants inside (for now it’s easiest to make # non-tensor constants literals in the function definition): @@ -527,11 +527,11 @@ def primitive_definition_for_memory_efficient_fusion( ###################################################################### # Now, instead of passing our function to TorchScript, we will pass it -# to FuncTorch’s optimization pass. +# to functorch optimization pass. # -# Optimize the model with FuncTorch tracing and the memory efficiency +# Optimize the model with functorch tracing and the memory efficiency # optimization pass memory_efficient_primitive_definition = memory_efficient_fusion( primitive_definition_for_memory_efficient_fusion @@ -550,22 +550,22 @@ def primitive_definition_for_memory_efficient_fusion( ###################################################################### # This recovers even more speed, but it’s still not as fast as -# TorchScripts original performance with the composite definition. +# TorchScript original performance with the composite definition. # However, this is still faster than running this new definition # without nvFuser, and is still faster than the composite definition # without nvFuser. # # .. figure:: /_static/img/nvfuser_intro/nvfuser_tutorial_5.png # -# .. note:: FuncTorch’s memory efficient pass is experimental and still +# .. note:: The functorch memory efficient pass is experimental and still # actively in development. # Future versions of the API are expected to achieve performance # closer to that of TorchScript with the composite definition. # -# .. note:: FuncTorch’s memory efficient pass specializes on the shapes of +# .. note:: The functorch memory efficient pass specializes on the shapes of # the inputs to the function. If new inputs are provided with # different shapes, then you need to construct a new function -# using `memory_efficient_fusion` and apply it to the new inputs. +# using ``memory_efficient_fusion`` and apply it to the new inputs. ###################################################################### @@ -577,10 +577,10 @@ def primitive_definition_for_memory_efficient_fusion( # an entirely new operation in PyTorch – which takes a lot of time and # knowledge of the lower-level PyTorch code as well as parallel # programming – or writing the operation in simpler PyTorch ops and -# settling for poor performance. For example, let's replace LayerNorm -# in our example with RMSNorm. Even though RMSNorm is a bit simpler -# than LayerNorm, it doesn’t have an existing compound operation in -# PyTorch. See the `Root Mean Square Layer Normalization `__ paper for more information about RMSNorm. +# settling for poor performance. For example, let's replace ``LayerNorm`` +# in our example with ``RMSNorm``. Even though ``RMSNorm`` is a bit simpler +# than ``LayerNorm``, it doesn’t have an existing compound operation in +# PyTorch. See the `Root Mean Square Layer Normalization `__ paper for more information about ``RMSNorm``. # As before, we’ll define our new transformer block with # primitive PyTorch operations. 
# @@ -608,7 +608,7 @@ def with_rms_norm( # As before, we’ll get a baseline by running PyTorch without nvFuser. # -# Profile rms_norm +# Profile ``rms_norm`` func = functools.partial( with_rms_norm, input1, @@ -625,7 +625,7 @@ def with_rms_norm( # With nvFuser through TorchScript. # -# Profile scripted rms_norm +# Profile scripted ``rms_norm`` scripted_with_rms_norm = torch.jit.script(with_rms_norm) func = functools.partial( scripted_with_rms_norm, @@ -656,7 +656,7 @@ def with_rms_norm_for_memory_efficient_fusion( return norm_output -# Profile memory efficient rms_norm +# Profile memory efficient ``rms_norm`` memory_efficient_rms_norm = memory_efficient_fusion( with_rms_norm_for_memory_efficient_fusion ) @@ -666,12 +666,12 @@ def with_rms_norm_for_memory_efficient_fusion( ###################################################################### # .. figure:: /_static/img/nvfuser_intro/nvfuser_tutorial_6.png # -# Since RMSNorm is simpler than LayerNorm the performance of our new +# Since ``RMSNorm`` is simpler than ``LayerNorm`` the performance of our new # transformer block is a little higher than the primitive definition # without nvFuser (354 iterations per second compared with 260 # iterations per second). With TorchScript, the iterations per second # increases by 2.68x and 3.36x to 952 iterations per second and 1,191 -# iterations per second with TorchScript and FuncTorch’s memory +# iterations per second with TorchScript and functorch memory # efficient optimization pass, respectively. The performance of this # new operation nearly matches the performance of the composite Layer # Norm definition with TorchScript. diff --git a/intermediate_source/parametrizations.py b/intermediate_source/parametrizations.py index 0f71a0aafe6..086a4300674 100644 --- a/intermediate_source/parametrizations.py +++ b/intermediate_source/parametrizations.py @@ -19,7 +19,7 @@ This approach proposes to decouple the learning of the parameters from the learning of their norms. To do so, the parameter is divided by its `Frobenius norm `_ -and a separate parameter encoding its norm is learnt. +and a separate parameter encoding its norm is learned. A similar regularization was proposed for GANs under the name of "`spectral normalization `_". This method controls the Lipschitz constant of the network by dividing its parameters by @@ -84,7 +84,7 @@ def forward(self, x): # 2) It does not separate the layer and the parametrization. If the parametrization were # more difficult, we would have to rewrite its code for each layer that we want to use it # in. -# 3) It recomputes the parametrization everytime we use the layer. If we use the layer +# 3) It recomputes the parametrization every time we use the layer. If we use the layer # several times during the forward pass, (imagine the recurrent kernel of an RNN), it # would compute the same ``A`` every time that the layer is called. # @@ -258,8 +258,8 @@ def forward(self, X): print((torch.symeig(X).eigenvalues > 0.).all()) # X is positive definite ############################################################################### -# Intializing parametrizations -# ---------------------------- +# Initializing parametrizations +# ----------------------------- # # Parametrizations come with a mechanism to initialize them. 
If we implement a method # ``right_inverse`` with signature @@ -327,7 +327,7 @@ def right_inverse(self, A): ############################################################################### # The name of this method comes from the fact that we would often expect # that ``forward(right_inverse(X)) == X``. This is a direct way of rewriting that -# the forward afer the initalization with value ``X`` should return the value ``X``. +# the forward after the initialization with value ``X`` should return the value ``X``. # This constraint is not strongly enforced in practice. In fact, at times, it might be of # interest to relax this relation. For example, consider the following implementation # of a randomized pruning method: diff --git a/intermediate_source/per_sample_grads.py b/intermediate_source/per_sample_grads.py index 9d2c774e9fc..c423679229c 100644 --- a/intermediate_source/per_sample_grads.py +++ b/intermediate_source/per_sample_grads.py @@ -70,7 +70,7 @@ def loss_fn(predictions, targets): predictions = model(data) # move the entire mini-batch through the model loss = loss_fn(predictions, targets) -loss.backward() # back propogate the 'average' gradient of this mini-batch +loss.backward() # back propagate the 'average' gradient of this mini-batch ###################################################################### # In contrast to the above approach, per-sample-gradient computation is @@ -114,7 +114,7 @@ def compute_sample_grads(data, targets): # Our strategy is to define a function that computes the loss and then apply # transforms to construct a function that computes per-sample-gradients. # -# We'll use the ``torch.func.functional_call`` function to treat an nn.Module +# We'll use the ``torch.func.functional_call`` function to treat an ``nn.Module`` # like a function. # # First, let’s extract the state from ``model`` into two dictionaries, @@ -146,16 +146,16 @@ def compute_loss(params, buffers, sample, target): ###################################################################### # Now, let’s use the ``grad`` transform to create a new function that computes # the gradient with respect to the first argument of ``compute_loss`` -# (i.e. the params). +# (i.e. the ``params``). ft_compute_grad = grad(compute_loss) ###################################################################### # The ``ft_compute_grad`` function computes the gradient for a single -# (sample, target) pair. We can use vmap to get it to compute the gradient +# (sample, target) pair. We can use ``vmap`` to get it to compute the gradient # over an entire batch of samples and targets. Note that # ``in_dims=(None, None, 0, 0)`` because we wish to map ``ft_compute_grad`` over -# the 0th dimension of the data and targets, and use the same params and +# the 0th dimension of the data and targets, and use the same ``params`` and # buffers for each. ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, None, 0, 0)) @@ -174,16 +174,16 @@ def compute_loss(params, buffers, sample, target): ###################################################################### # A quick note: there are limitations around what types of functions can be -# transformed by vmap. The best functions to transform are ones that are pure +# transformed by ``vmap``. The best functions to transform are ones that are pure # functions: a function where the outputs are only determined by the inputs, -# and that have no side effects (e.g. mutation). vmap is unable to handle +# and that have no side effects (e.g. mutation). 
``vmap`` is unable to handle
 # mutation of arbitrary Python data structures, but it is able to handle many
 # in-place PyTorch operations.
 #
 # Performance comparison
 # ----------------------
 #
-# Curious about how the performance of vmap compares?
+# Curious about how the performance of ``vmap`` compares?
 #
 # Currently the best results are obtained on newer GPU's such as the A100
 # (Ampere) where we've seen up to 25x speedups on this example, but here are
@@ -218,9 +218,9 @@ def get_perf(first, first_descriptor, second, second_descriptor):
 # the naive method. But it’s cool that composing ``vmap`` and ``grad`` give us a
 # nice speedup.
 #
-# In general, vectorization with vmap should be faster than running a function
+# In general, vectorization with ``vmap`` should be faster than running a function
 # in a for-loop and competitive with manual batching. There are some exceptions
-# though, like if we haven’t implemented the vmap rule for a particular
+# though, like if we haven’t implemented the ``vmap`` rule for a particular
 # operation or if the underlying kernels weren’t optimized for older hardware
 # (GPUs). If you see any of these cases, please let us know by opening an issue
 # at on GitHub.
diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py
index bdd6cabb3f2..398d3cdf28a 100644
--- a/intermediate_source/pipeline_tutorial.py
+++ b/intermediate_source/pipeline_tutorial.py
@@ -35,7 +35,7 @@
 # As a result, our focus is on ``nn.TransformerEncoder`` and we split the model
 # such that half of the ``nn.TransformerEncoderLayer`` are on one GPU and the
 # other half are on another. To do this, we pull out the ``Encoder`` and
-# ``Decoder`` sections into seperate modules and then build an nn.Sequential
+# ``Decoder`` sections into seperate modules and then build an ``nn.Sequential``
 # representing the original Transformer module.
 
 import sys
@@ -172,11 +172,11 @@ def data_process(raw_text_iter):
 device = torch.device("cuda")
 
 def batchify(data, bsz):
-    # Divide the dataset into bsz parts.
+    # Divide the dataset into ``bsz`` parts.
     nbatch = data.size(0) // bsz
     # Trim off any extra elements that wouldn't cleanly fit (remainders).
     data = data.narrow(0, 0, nbatch * bsz)
-    # Evenly divide the data across the bsz batches.
+    # Evenly divide the data across the ``bsz`` batches. 
data = data.view(bsz, -1).t().contiguous() return data.to(device) @@ -245,9 +245,9 @@ def get_batch(source, i): ntokens = len(vocab) # the size of vocabulary emsize = 4096 # embedding dimension -nhid = 4096 # the dimension of the feedforward network model in nn.TransformerEncoder -nlayers = 12 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder -nhead = 16 # the number of heads in the multiheadattention models +nhid = 4096 # the dimension of the feedforward network model in ``nn.TransformerEncoder`` +nlayers = 12 # the number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder`` +nhead = 16 # the number of heads in the Multihead Attention models dropout = 0.2 # the dropout value from torch.distributed import rpc From 675e9e49ee1397e2f7af9523174b9b1d7c8a3d60 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 19 Apr 2023 09:01:55 -0700 Subject: [PATCH 2/7] Update --- .pyspelling.yml | 3 ++- en-wordlist.txt | 6 ++++++ intermediate_source/pipeline_tutorial.py | 23 ++++++++++++----------- intermediate_source/pruning_tutorial.py | 4 ++-- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index 598ce7698df..570785eee7e 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -22,7 +22,8 @@ matrix: #- intermediate_source/nvfuser_intro_tutorial.py #- intermediate_source/parametrizations.py #- intermediate_source/per_sample_grads.py - - intermediate_source/pipeline_tutorial.py + #- intermediate_source/pipeline_tutorial.py + - intermediate_source/pruning_tutorial.py dictionary: wordlists: - en-wordlist.txt diff --git a/en-wordlist.txt b/en-wordlist.txt index 2c9f5e270c8..9e88bc1f7bf 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,3 +1,9 @@ +subnetworks +sparsify +LeCun +prepruned +dimensionality +unpruned RPC multihead GPU's diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py index 398d3cdf28a..444eddf8415 100644 --- a/intermediate_source/pipeline_tutorial.py +++ b/intermediate_source/pipeline_tutorial.py @@ -35,7 +35,7 @@ # As a result, our focus is on ``nn.TransformerEncoder`` and we split the model # such that half of the ``nn.TransformerEncoderLayer`` are on one GPU and the # other half are on another. To do this, we pull out the ``Encoder`` and -# ``Decoder`` sections into seperate modules and then build an ``nn.Sequential`` +# ``Decoder`` sections into separate modules and then build an ``nn.Sequential`` # representing the original Transformer module. import sys @@ -134,16 +134,17 @@ def forward(self, x): # length 6: # # .. 
math:: -# \begin{bmatrix} -# \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z} -# \end{bmatrix} -# \Rightarrow -# \begin{bmatrix} -# \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} & -# \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} & -# \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} & -# \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix} -# \end{bmatrix} +# +# \begin{bmatrix} +# \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z} +# \end{bmatrix} +# \Rightarrow +# \begin{bmatrix} +# \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} & +# \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} & +# \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} & +# \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix} +# \end{bmatrix} # # These columns are treated as independent by the model, which means that # the dependence of ``G`` and ``F`` can not be learned, but allows more diff --git a/intermediate_source/pruning_tutorial.py b/intermediate_source/pruning_tutorial.py index d8de5a7502a..ba6701c8c35 100644 --- a/intermediate_source/pruning_tutorial.py +++ b/intermediate_source/pruning_tutorial.py @@ -339,8 +339,8 @@ def forward(self, x): # pruning this technique implements (supported options are ``global``, # ``structured``, and ``unstructured``). This is needed to determine # how to combine masks in the case in which pruning is applied -# iteratively. In other words, when pruning a pre-pruned parameter, -# the current prunining techique is expected to act on the unpruned +# iteratively. In other words, when pruning a prepruned parameter, +# the current pruning technique is expected to act on the unpruned # portion of the parameter. 
Specifying the ``PRUNING_TYPE`` will # enable the ``PruningContainer`` (which handles the iterative # application of pruning masks) to correctly identify the slice of the From 38a94d58274425dbb202195afdf6c89cc35b2b65 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 15:29:26 -0700 Subject: [PATCH 3/7] Update --- .pyspelling.yml | 19 +- 1 | 341 ++++++++++++++++++ en-wordlist.txt | 90 +++-- .../flask_rest_api_tutorial.py | 6 +- intermediate_source/reinforcement_ppo.py | 48 +-- .../reinforcement_q_learning.py | 18 +- .../scaled_dot_product_attention_tutorial.py | 39 +- .../seq2seq_translation_tutorial.py | 12 +- .../tensorboard_profiler_tutorial.py | 16 +- intermediate_source/torch_compile_tutorial.py | 6 +- 10 files changed, 499 insertions(+), 96 deletions(-) create mode 100644 1 diff --git a/.pyspelling.yml b/.pyspelling.yml index 570785eee7e..bce94f383cc 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -2,7 +2,8 @@ spellchecker: aspell matrix: - name: python sources: - #- beginner_source/*.py + - beginner_source/*.py + - intermediate_source/*.py #- intermediate_source/autograd_saved_tensors_hooks_tutorial.py #- intermediate_source/ax_multiobjective_nas_tutorial.py #- intermediate_source/char_rnn_classification_tutorial.py @@ -23,7 +24,14 @@ matrix: #- intermediate_source/parametrizations.py #- intermediate_source/per_sample_grads.py #- intermediate_source/pipeline_tutorial.py - - intermediate_source/pruning_tutorial.py + #- intermediate_source/pruning_tutorial.py + #- intermediate_source/reinforcement_ppo.py + #- intermediate_source/reinforcement_q_learning.py + #- intermediate_source/scaled_dot_product_attention_tutorial.py + #- intermediate_source/seq2seq_translation_tutorial.py + #- intermediate_source/spatial_transformer_tutorial.py + #- intermediate_source/tensorboard_profiler_tutorial.py + #- intermediate_source/torch_compile_tutorial.py dictionary: wordlists: - en-wordlist.txt @@ -36,9 +44,16 @@ matrix: # Exclude figure rST tags - open: '\.\.\s+(figure|literalinclude|math|image|grid)::' close: '\n' + # Exclude roles: + - open: ':(?:(class|py:mod|mod|func)):`' + content: '[^`]*' + close: '`' # Exclude raw directive - open: '\.\. 
(raw)::.*$\n*' close: '\n' + # Exclude + - open: '.*(:py:mod:).*' + close: ' ' # Exclude Python coding directives - open: '-\*- coding:' close: '\n' diff --git a/1 b/1 new file mode 100644 index 00000000000..996a909cd88 --- /dev/null +++ b/1 @@ -0,0 +1,341 @@ +Andrej +Karpathy's +NanoGPT +compilable +decorrelated +DQN +deterministically +approximators +duration +CartPole +EPS +APIs +Args +Autograd +BCE +BN +BOS +Bahdanau +BatchNorm +CHW +CIFAR +CLS +CNNDM +CNNs +CPUs +CUDA +Cayley +Chatbots +Colab +Conv +ConvNet +ConvNets +DCGAN +DCGANs +DDQN +DNN +DataLoaders +DeepMind +DeiT +DenseNet +EOS +FC +FGSM +FLAVA +FX +FX's +FloydHub +FloydHub's +Frobenius +GAE +GAN +GANs +GPU's +GPUs +GRU +GRUs +GTC +GeForce +Goodfellow +Goodfellow’s +GreedySearchDecoder +HVP +Hugging Face +IMDB +ImageNet +Initializations +Iteratively +JSON +JVP +Jacobian +Kiuk +Kubernetes +Kuei +LSTM +LSTMs +LeCun +LeNet +LeakyReLU +LeakyReLUs +Lipschitz +Lua +Luong +MLP +MLPs +MNIST +Mypy +NAS +NCHW +NES +NLP +NTK +NaN +NeurIPS +NumPy +Numericalization +Numpy's +OpenAI +PPO +Plotly +Prec +Profiler +PyTorch's +RGB +RL +RNN +RNNs +RPC +RTX +Radford +ReLU +ResNet +SPD +SST2 +Sequentials +Sigmoid +SoTA +TPU +TensorBoard +TextVQA +Tokenization +TorchMultimodal +TorchRL +TorchRL's +TorchScript +TorchX +Tunable +Unescape +VQA +Wikitext +Xeon +accuracies +activations +adversarially +al +autodiff +autograd +backend +backends +backprop +backpropagate +backpropagated +backpropagates +backpropagation +batchnorm +batchnorm's +benchmarking +boolean +broadcasted +cardinality +chatbot +chatbot's +checkpointing +composable +concat +config +contrastive +conv +convolutional +cpu +csv +cuDNN +datafile +dataframe +dataloader +dataloaders +datapipes +dataset +datasets +dataset’s +deserialize +deserialized +dimensionality +dir +downsample +downsamples +embeddings +encodings +ensembling +eq +et +evaluateInput +extensibility +fastai +fbgemm +feedforward +finetune +finetuning +fp +functorch +fuser +geomean +grayscale +hardcode +helpdesk +helpdesks +hessian +hessians +hvp +hyperparameter +hyperparameters +imagenet +initializations +inlined +interpretable +io +iterable +iteratively +jacobian +jacobians +jit +jpg +kwargs +labelled +learnable +learnings +loadFilename +manualSeed +matplotlib +minibatch +minibatches +minimax +misclassification +misclassified +modularity +modularized +multihead +multimodal +multimodality +multiobjective +multiprocessed +multithreaded +namespace +natively +ndarrays +num +numericalize +numpy +nvFuser +nvFuser's +optimizable +optimizer's +optimizers +overfitting +parallelizable +parallelization +parametrization +parametrizations +parametrized +parametrizing +perceptibility +pipelining +pointwise +precompute +precomputing +prepend +preprocess +preprocessing +prepruned +prespecified +pretrained +prewritten +primals +profiler +profilers +pytorch +quantized +quantizing +queryable +randint +readably +recomputation +reimplement +reimplementing +reimplements +reinitializes +relu +reproducibility +rescale +resnet +restride +rewinded +rollout +romanized +runnable +runtime +runtime +runtimes +scalable +softmax +sparsify +specificities +src +stacktrace +stateful +storages +strided +subclasses +subclassing +subdirectories +submodule +submodules +subnetworks +subreddit +summarization +tanh +th +thresholding +timestep +timesteps +tokenization +tokenize +tokenizer +topologies +torchaudio +torchdata +torchscriptable +torchtext +torchtext's +torchvision +torchviz +traceback +tradeoff +tradeoffs +uncomment +uncommented +unfused +unimodal 
+unnormalized +unparametrized +unpickling +unpruned +updation +utils +vectorization +vectorize +vectorized +vhp +voc +walkthrough +warmstart +warmstarting diff --git a/en-wordlist.txt b/en-wordlist.txt index 9e88bc1f7bf..d8ddb614157 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,26 +1,10 @@ -subnetworks -sparsify -LeCun -prepruned -dimensionality -unpruned -RPC -multihead -GPU's -Lipschitz -Frobenius -reimplement -reimplements -reimplementing -parametrizing -unparametrized -submodules -SPD -Cayley -parametrization -parametrized -parametrizations +UI +bytecode +TorchInductor +unoptimized +TorchDynamo APIs +ATen Args Autograd BCE @@ -35,6 +19,8 @@ CNNDM CNNs CPUs CUDA +CartPole +Cayley Chatbots Colab Conv @@ -42,12 +28,16 @@ ConvNet ConvNets DCGAN DCGANs +DDP DDQN DNN +DQN DataLoaders +DeepMind DeiT DenseNet EOS +EPS FC FGSM FLAVA @@ -55,8 +45,12 @@ FX FX's FloydHub FloydHub's +Frobenius +GAE GAN GANs +GLOO +GPU's GPUs GRU GRUs @@ -68,6 +62,7 @@ GreedySearchDecoder HVP Hugging Face IMDB +IOT ImageNet Initializations Iteratively @@ -79,26 +74,32 @@ Kubernetes Kuei LSTM LSTMs +LeCun LeNet LeakyReLU LeakyReLUs +Lipschitz Lua Luong MLP MLPs MNIST +MacBook Mypy NAS +NCCL NCHW NES NLP NTK NaN +NanoGPT NeurIPS NumPy Numericalization Numpy's OpenAI +PPO Plotly Prec Profiler @@ -107,11 +108,16 @@ RGB RL RNN RNNs +RPC RTX Radford ReLU ResNet +SDPA +SGD +SPD SST2 +STN Sequentials Sigmoid SoTA @@ -120,18 +126,27 @@ TensorBoard TextVQA Tokenization TorchMultimodal +TorchRL +TorchRL's TorchScript TorchX Tunable Unescape VQA +VS Code Wikitext Xeon accuracies activations adversarially +affine al +allocator +allocator's +allocators +approximators autodiff +autoencoder autograd backend backends @@ -145,9 +160,12 @@ batchnorm's benchmarking boolean broadcasted +cardinality chatbot chatbot's checkpointing +colorbar +compilable composable concat config @@ -165,11 +183,17 @@ datapipes dataset datasets dataset’s +deallocation +decorrelated deserialize deserialized +deterministically +dimensionality dir downsample downsamples +dropdown +duration embeddings encodings ensembling @@ -199,6 +223,7 @@ imagenet initializations inlined interpretable +invariance io iterable iteratively @@ -206,9 +231,11 @@ jacobian jacobians jit jpg +judgements kwargs labelled learnable +learnings loadFilename manualSeed matplotlib @@ -219,9 +246,11 @@ misclassification misclassified modularity modularized +multihead multimodal multimodality multiobjective +multiprocessed multithreaded namespace natively @@ -237,6 +266,10 @@ optimizers overfitting parallelizable parallelization +parametrization +parametrizations +parametrized +parametrizing perceptibility pipelining pointwise @@ -245,6 +278,7 @@ precomputing prepend preprocess preprocessing +prepruned prespecified pretrained prewritten @@ -258,6 +292,10 @@ queryable randint readably recomputation +regressor +reimplement +reimplementing +reimplements reinitializes relu reproducibility @@ -265,6 +303,7 @@ rescale resnet restride rewinded +rollout romanized runnable runtime @@ -272,6 +311,8 @@ runtime runtimes scalable softmax +sparsify +specificities src stacktrace stateful @@ -281,6 +322,8 @@ subclasses subclassing subdirectories submodule +submodules +subnetworks subreddit summarization tanh @@ -291,6 +334,7 @@ timesteps tokenization tokenize tokenizer +tooltip topologies torchaudio torchdata @@ -307,7 +351,9 @@ uncommented unfused unimodal unnormalized +unparametrized unpickling +unpruned updation utils vectorization diff --git a/intermediate_source/flask_rest_api_tutorial.py 
b/intermediate_source/flask_rest_api_tutorial.py index 690fa975a5c..0975ff93125 100644 --- a/intermediate_source/flask_rest_api_tutorial.py +++ b/intermediate_source/flask_rest_api_tutorial.py @@ -318,10 +318,10 @@ def get_prediction(image_bytes): # # .. code-block:: python # -# import requests +# import requests # -# resp = requests.post("http://localhost:5000/predict", -# files={"file": open('/cat.jpg','rb')}) +# resp = requests.post("http://localhost:5000/predict", +# files={"file": open('/cat.jpg','rb')}) ####################################################################### # Printing `resp.json()` will now show the following: diff --git a/intermediate_source/reinforcement_ppo.py b/intermediate_source/reinforcement_ppo.py index 8dee73969db..dc6eca94931 100644 --- a/intermediate_source/reinforcement_ppo.py +++ b/intermediate_source/reinforcement_ppo.py @@ -15,7 +15,7 @@ Key learnings: -- How to create an environment in TorchRL, transform its outputs, and collect data from this env; +- How to create an environment in TorchRL, transform its outputs, and collect data from this environment; - How to make your classes talk to each other using :class:`tensordict.TensorDict`; - The basics of building your training loop with TorchRL: @@ -166,7 +166,7 @@ # When using ``frame_skip`` it is good practice to # correct the other frame counts by the number of frames we are grouping # together. If we configure a total count of X frames for training but -# use a ``frame_skip`` of Y, we will be actually collecting XY frames in total +# use a ``frame_skip`` of Y, we will be actually collecting ``XY`` frames in total # which exceeds our predefined budget. # frame_skip = 1 @@ -187,7 +187,7 @@ # The size of these sub-batches is controlled by ``sub_batch_size``. # sub_batch_size = 64 # cardinality of the sub-samples gathered from the current data in the inner loop -num_epochs = 10 # optimisation steps per batch of data collected +num_epochs = 10 # optimization steps per batch of data collected clip_epsilon = ( 0.2 # clip value for PPO loss: see the equation in the intro for more context. ) @@ -201,9 +201,9 @@ # # In RL, an *environment* is usually the way we refer to a simulator or a # control system. Various libraries provide simulation environments for reinforcement -# learning, including Gymnasium (previously OpenAI Gym), DeepMind control suite, and +# learning, including Gymnasium (previously OpenAI Gym), DeepMind Control Suite, and # many others. -# As a generalistic library, TorchRL's goal is to provide an interchangeable interface +# As a general library, TorchRL's goal is to provide an interchangeable interface # to a large panel of RL simulators, allowing you to easily swap one environment # with another. For example, creating a wrapped gym environment can be achieved with few characters: # @@ -214,12 +214,12 @@ # There are a few things to notice in this code: first, we created # the environment by calling the ``GymEnv`` wrapper. If extra keyword arguments # are passed, they will be transmitted to the ``gym.make`` method, hence covering -# the most common env construction commands. +# the most common environment construction commands. # Alternatively, one could also directly create a gym environment using ``gym.make(env_name, **kwargs)`` # and wrap it in a `GymWrapper` class. 
# # Also the ``device`` argument: for gym, this only controls the device where -# input action and observered states will be stored, but the execution will always +# input action and observed states will be stored, but the execution will always # be done on CPU. The reason for this is simply that gym does not support on-device # execution, unless specified otherwise. For other libraries, we have control over # the execution device and, as much as we can, we try to stay consistent in terms of @@ -232,8 +232,8 @@ # the policy. In Gym, this is usually achieved via wrappers. TorchRL takes a different # approach, more similar to other pytorch domain libraries, through the use of transforms. # To add transforms to an environment, one should simply wrap it in a :class:`TransformedEnv` -# instance, and append the sequence of transforms to it. The transformed env will inherit -# the device and meta-data of the wrapped env, and transform these depending on the sequence +# instance, and append the sequence of transforms to it. The transformed environment will inherit +# the device and meta-data of the wrapped environment, and transform these depending on the sequence # of transforms it contains. # # Normalization @@ -255,7 +255,7 @@ # to communicate. You could think of it as a python dictionary with some extra # tensor features. In practice, this means that many modules we will be working # with need to be told what key to read (``in_keys``) and what key to write -# (``out_keys``) in the tensordict they will receive. Usually, if ``out_keys`` +# (``out_keys``) in the ``tensordict`` they will receive. Usually, if ``out_keys`` # is omitted, it is assumed that the ``in_keys`` entries will be updated # in-place. For our transforms, the only entry we are interested in is referred # to as ``"observation"`` and our transform layers will be told to modify this @@ -295,7 +295,7 @@ # environment specs, but you can easily check that your environment specs are # adequate. # In our example, the :class:`GymWrapper` and :class:`GymEnv` that inherits -# from it already take care of setting the proper specs for your env so +# from it already take care of setting the proper specs for your environment so # you should not have to care about this. # # Nevertheless, let's see a concrete example using our transformed @@ -312,7 +312,7 @@ print("action_spec (as defined by input_spec):", env.action_spec) ###################################################################### -# the :func:`check_env_specs` function runs a small rollout and compares its output against the environemnt +# the :func:`check_env_specs` function runs a small rollout and compares its output against the environment # specs. If no error is raised, we can be confident that the specs are properly defined: # check_env_specs(env) @@ -328,7 +328,7 @@ # observation may be composite, meaning that it could be composed of more than one # tensor. This is not a problem for TorchRL, since the whole set of observations # is automatically packed in the output :class:`tensordict.TensorDict`. After executing a rollout -# (ie a sequence of environment steps and random action generations) over a given +# (for example, a sequence of environment steps and random action generations) over a given # number of steps, we will retrieve a :class:`tensordict.TensorDict` instance with a shape # that matches this trajectory length: # @@ -340,7 +340,7 @@ # Our rollout data has a shape of ``torch.Size([3])`, which matches the number of steps # we ran it for. 
The ``"next"`` entry points to the data coming after the current step. # In most cases, the ``"next""`` data at time `t` matches the data at ``t+1``, but this -# may not be the case if we are using some specific transformations (e.g. mutli-step). +# may not be the case if we are using some specific transformations (for example, multi-step). # # Policy # ------ @@ -360,13 +360,13 @@ # f_{\theta}(\text{observation}) = \mu_{\theta}(\text{observation}), \sigma^{+}_{\theta}(\text{observation}) # # The only extra-difficulty that is brought up here is to split our output in two -# equal parts and map the second to a scrictly positive space. +# equal parts and map the second to a strictly positive space. # # We design the policy in three steps: # # 1. Define a neural network ``D_obs`` -> ``2 * D_action``. Indeed, our ``loc`` (mu) and ``scale`` (sigma) both have dimension ``D_action``; # -# 2. Append a :class:`NormalParamExtractor` to extract a location and a scale (ie splits the input in two equal parts +# 2. Append a :class:`NormalParamExtractor` to extract a location and a scale (for example, splits the input in two equal parts # and applies a positive transformation to the scale parameter); # # 3. Create a probabilistic :class:`TensorDictModule` that can create this distribution and sample from it. @@ -384,7 +384,7 @@ ) ###################################################################### -# To enable the policy to "talk" with the environment through the tensordict +# To enable the policy to "talk" with the environment through the ``tensordict`` # data carrier, we wrap the ``nn.Module`` in a :class:`TensorDictModule`. This # class will simply ready the ``in_keys`` it is provided with and write the # outputs in-place at the registered ``out_keys``. @@ -429,7 +429,7 @@ # won't be used at inference time. This module will read the observations and # return an estimation of the discounted return for the following trajectory. # This allows us to amortize learning by relying on the some utility estimation -# that is learnt on-the-fly during training. Our value network share the same +# that is learned on-the-fly during training. Our value network share the same # structure as the policy, but for simplicity we assign it its own set of # parameters. # @@ -484,7 +484,7 @@ # As for the policy and environment before, the data collector will return # :class:`tensordict.TensorDict` instances with a total number of elements that will # match ``frames_per_batch``. Using :class:`tensordict.TensorDict` to pass data to the -# training loop allows you to write dataloading pipelines +# training loop allows you to write data loading pipelines # that are 100% oblivious to the actual specificities of the rollout content. # collector = SyncDataCollector( @@ -525,7 +525,7 @@ # Loss function # ------------- # -# The PPO loss can be directly imported from torchrl for convenience using the +# The PPO loss can be directly imported from TorchRL for convenience using the # :class:`ClipPPOLoss` class. This is the easiest way of utilizing PPO: # it hides away the mathematical operations of PPO and the control flow that # goes with it. @@ -536,7 +536,7 @@ # To compute the advantage, one just needs to (1) build the advantage module, which # utilizes our value operator, and (2) pass each batch of data through it before each # epoch. -# The GAE module will update the input tensordict with new ``"advantage"`` and +# The GAE module will update the input ``tensordict`` with new ``"advantage"`` and # ``"value_target"`` entries. 
# The ``"value_target"`` is a gradient-free tensor that represents the empirical # value that the value network should represent with the input observation. @@ -612,7 +612,7 @@ + loss_vals["loss_entropy"] ) - # Optimization: backward, grad clipping and optim step + # Optimization: backward, grad clipping and optimization step loss_value.backward() # this is not strictly mandatory but it's good practice to keep # your gradient norm bounded @@ -633,8 +633,8 @@ # We evaluate the policy once every 10 batches of data. # Evaluation is rather simple: execute the policy without exploration # (take the expected value of the action distribution) for a given - # number of steps (1000, which is our env horizon). - # The ``rollout`` method of the env can take a policy as argument: + # number of steps (1000, which is our ``env`` horizon). + # The ``rollout`` method of the ``env`` can take a policy as argument: # it will then execute this policy at each step. with set_exploration_mode("mean"), torch.no_grad(): # execute a rollout with the trained policy diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 083ce07f77a..78dc7e2fc6e 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -17,9 +17,9 @@ `Gymnasium's website `__. .. figure:: /_static/img/cartpole.gif - :alt: cartpole + :alt: CartPole - cartpole + CartPole As the agent observes the current state of the environment and chooses an action, the environment *transitions* to a new state, and also @@ -45,7 +45,7 @@ `gymnasium `__ for the environment, installed by using `pip`. This is a fork of the original OpenAI Gym project and maintained by the same team since Gym v0.19. -If you are running this in Google colab, run: +If you are running this in Google Colab, run: .. code-block:: bash @@ -82,7 +82,7 @@ plt.ion() -# if gpu is to be used +# if GPU is to be used device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -96,7 +96,7 @@ # batch are decorrelated. It has been shown that this greatly stabilizes # and improves the DQN training procedure. # -# For this, we're going to need two classses: +# For this, we're going to need two classes: # # - ``Transition`` - a named tuple representing a single transition in # our environment. It essentially maps (state, action) pairs @@ -172,7 +172,7 @@ def __len__(self): # # .. math:: \delta = Q(s, a) - (r + \gamma \max_a' Q(s', a)) # -# To minimise this error, we will use the `Huber +# To minimize this error, we will use the `Huber # loss `__. The Huber loss acts # like the mean squared error when the error is small, but like the mean # absolute error when the error is large - this makes it more robust to @@ -233,7 +233,7 @@ def forward(self, x): # probability of choosing a random action will start at ``EPS_START`` # and will decay exponentially towards ``EPS_END``. ``EPS_DECAY`` # controls the rate of the decay. -# - ``plot_durations`` - a helper for plotting the durations of episodes, +# - ``plot_durations`` - a helper for plotting the duration of episodes, # along with an average over the last 100 episodes (the measure used in # the official evaluations). 
The plot will be underneath the cell # containing the main training loop, and will update after every @@ -246,7 +246,7 @@ def forward(self, x): # EPS_END is the final value of epsilon # EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay # TAU is the update rate of the target network -# LR is the learning rate of the AdamW optimizer +# LR is the learning rate of the ``AdamW`` optimizer BATCH_SIZE = 128 GAMMA = 0.99 EPS_START = 0.9 @@ -391,7 +391,7 @@ def optimize_model(): # # Below, `num_episodes` is set to 600 if a GPU is available, otherwise 50 # episodes are scheduled so training does not take too long. However, 50 -# episodes is insufficient for to observe good performance on cartpole. +# episodes is insufficient for to observe good performance on CartPole. # You should see the model constantly achieve 500 steps within 600 training # episodes. Training RL agents can be a noisy process, so restarting training # can produce better results if convergence is not observed. diff --git a/intermediate_source/scaled_dot_product_attention_tutorial.py b/intermediate_source/scaled_dot_product_attention_tutorial.py index fbc76a15799..669e516f2c2 100644 --- a/intermediate_source/scaled_dot_product_attention_tutorial.py +++ b/intermediate_source/scaled_dot_product_attention_tutorial.py @@ -88,7 +88,7 @@ def benchmark_torch_function_in_microseconds(f, *args, **kwargs): # Lets explore the speed of each of the 3 implementations from torch.backends.cuda import sdp_kernel, SDPBackend -# Helpful arg mapper +# Helpful arguments mapper backend_map = { SDPBackend.MATH: {"enable_math": True, "enable_flash": False, "enable_mem_efficient": False}, SDPBackend.FLASH_ATTENTION: {"enable_math": False, "enable_flash": True, "enable_mem_efficient": False}, @@ -130,8 +130,8 @@ def benchmark_torch_function_in_microseconds(f, *args, **kwargs): # ~~~~~~~~~~~~~~~~~~~~~ # # Below is an example implementation of a multi-headed causal self -# attention block inspired by Andrej Karpathy’s -# `NanoGPT `__ repository. +# attention block inspired by +# `Andrej Karpathy NanoGPT `__ repository. # class CausalSelfAttention(nn.Module): @@ -186,12 +186,12 @@ def forward(self, x): print(model) -###################################################################### -# NestedTensor and Dense tensor support -# ------------------------------------- +##################################################################### +# ``NestedTensor`` and Dense tensor support +# ----------------------------------------- # -# SDPA supports both NestedTensor and Dense tensor inputs. NestedTensors handle the case where the input is a batch of variable length sequences -# without needing to pad each sequence to the maximum length in the batch. For more information about NestedTensors see +# SDPA supports both ``NestedTensor`` and Dense tensor inputs. ``NestedTensors`` handle the case where the input is a batch of variable length sequences +# without needing to pad each sequence to the maximum length in the batch. For more information about ``NestedTensors`` see # `torch.nested `__ and `NestedTensors Tutorial `__. 
# @@ -236,7 +236,7 @@ def generate_rand_batch( random_nt, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=0.5, dtype=dtype, device=device) random_dense, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=None, dtype=dtype, device=device) -# Currently the fused implementations don't support NestedTensor for training +# Currently the fused implementations don't support ``NestedTensor`` for training model.eval() with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]): @@ -248,14 +248,14 @@ def generate_rand_batch( ###################################################################### -# Using SDPA with torch.compile -# ============================ +# Using SDPA with ``torch.compile`` +# ================================= # # With the release of PyTorch 2.0, a new feature called # ``torch.compile()`` has been introduced, which can provide # significant performance improvements over eager mode. # Scaled dot product attention is fully composable with ``torch.compile()``. -# To demonstrate this, let's compile the CausalSelfAttention module using +# To demonstrate this, let's compile the ``CausalSelfAttention`` module using # ``torch.compile()`` and observe the resulting performance improvements. # @@ -303,7 +303,9 @@ def generate_rand_batch( print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) # For even more insights, you can export the trace and use ``chrome://tracing`` to view the results -# prof.export_chrome_trace("compiled_causal_attention_trace.json"). +# :: +# +# prof.export_chrome_trace("compiled_causal_attention_trace.json"). @@ -315,15 +317,14 @@ def generate_rand_batch( # on the same set of functions for both modules. # The reason for this here is that ``torch.compile`` is very good at removing the # framework overhead associated with PyTorch. If your model is launching -# large, efficient CUDA kernels, which in this case CausaulSelfAttention +# large, efficient CUDA kernels, which in this case ``CausaulSelfAttention`` # is, then the overhead of PyTorch can be hidden. # # In reality, your module does not normally consist of a singular -# CausalSelfAttention block. When experimenting with Andrej Karpathy’s -# `NanoGPT `__ repository, compiling +# ``CausalSelfAttention`` block. When experimenting with `Andrej Karpathy NanoGPT `__ repository, compiling # the module took the time per train step from: ``6090.49ms`` to -# ``3273.17ms``! This was done on commit: ae3a8d5 of NanoGPT training on -# the shakespeare dataset. +# ``3273.17ms``! This was done on commit: ``ae3a8d5`` of NanoGPT training on +# the Shakespeare dataset. # @@ -335,7 +336,7 @@ def generate_rand_batch( # ``torch.nn.functional.scaled_dot_product_attention``. We have shown how # the ``sdp_kernel`` context manager can be used to assert a certain # implementation is used on GPU. As well, we built a simple -# CausalSelfAttention module that works with NestedTensor and is torch +# ``CausalSelfAttention`` module that works with ``NestedTensor`` and is torch # compilable. In the process we have shown how to the profiling tools can # be used to explore the performance characteristics of a user defined # module. 
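A minimal, illustrative sketch of the ``sdp_kernel`` and ``scaled_dot_product_attention`` pattern exercised in the hunks above (assumes PyTorch 2.0 with a CUDA device; the tensor shapes below are hypothetical placeholders, not values from the tutorial):

import torch
import torch.nn.functional as F
from torch.backends.cuda import sdp_kernel

# Hypothetical batch size, number of heads, sequence length, and head dimension
query = torch.rand(8, 16, 64, 64, device="cuda", dtype=torch.float16)
key = torch.rand(8, 16, 64, 64, device="cuda", dtype=torch.float16)
value = torch.rand(8, 16, 64, 64, device="cuda", dtype=torch.float16)

# Force the flash attention kernel, mirroring the tutorial's backend_map entries
flash_only = {"enable_math": False, "enable_flash": True, "enable_mem_efficient": False}
with sdp_kernel(**flash_only):
    out = F.scaled_dot_product_attention(query, key, value)
print(out.shape)  # same shape as query: torch.Size([8, 16, 64, 64])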
diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py index e8a5651c57b..853cb2aed45 100644 --- a/intermediate_source/seq2seq_translation_tutorial.py +++ b/intermediate_source/seq2seq_translation_tutorial.py @@ -106,7 +106,7 @@ # yet, someone did the extra work of splitting language pairs into # individual text files here: https://www.manythings.org/anki/ # -# The English to French pairs are too big to include in the repo, so +# The English to French pairs are too big to include in the repository, so # download to ``data/eng-fra.txt`` before continuing. The file is a tab # separated list of translation pairs: # @@ -301,10 +301,10 @@ def prepareData(lang1, lang2, reverse=False): # length and order, which makes it ideal for translation between two # languages. # -# Consider the sentence "Je ne suis pas le chat noir" → "I am not the -# black cat". Most of the words in the input sentence have a direct +# Consider the sentence ``Je ne suis pas le chat noir`` → ``I am not the +# black cat``. Most of the words in the input sentence have a direct # translation in the output sentence, but are in slightly different -# orders, e.g. "chat noir" and "black cat". Because of the "ne/pas" +# orders, e.g. ``chat noir`` and ``black cat``. Because of the ``ne/pas`` # construction there is also one more word in the input sentence. It would # be difficult to produce a correct translation directly from the sequence # of input words. @@ -844,8 +844,8 @@ def evaluateAndShowAttention(input_sentence): # - Chat → Response # - Question → Answer # -# - Replace the embeddings with pre-trained word embeddings such as word2vec or -# GloVe +# - Replace the embeddings with pretrained word embeddings such as ``word2vec`` or +# ``GloVe`` # - Try with more layers, more hidden units, and more sentences. Compare # the training time and results. # - If you use a translation file where pairs have two of the same phrase diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py index 7cd241d40ad..440f2257e1a 100644 --- a/intermediate_source/tensorboard_profiler_tutorial.py +++ b/intermediate_source/tensorboard_profiler_tutorial.py @@ -54,7 +54,7 @@ ###################################################################### # Then prepare the input data. For this tutorial, we use the CIFAR10 dataset. -# Transform it to the desired format and use DataLoader to load each batch. +# Transform it to the desired format and use ``DataLoader`` to load each batch. transform = T.Compose( [T.Resize(224), @@ -116,7 +116,7 @@ def train(data): # - ``profile_memory`` - Track tensor memory allocation/deallocation. Note, for old version of pytorch with version # before 1.10, if you suffer long profiling time, please disable it or upgrade to new version. # - ``with_stack`` - Record source information (file and line number) for the ops. -# If the TensorBoard is launched in VSCode (`reference `_), +# If the TensorBoard is launched in VS Code (`reference `_), # clicking a stack frame will navigate to the specific code line. with torch.profiler.profile( @@ -217,13 +217,13 @@ def train(data): # The "Total" duration includes its child operators’ time. # # - View call stack -# Click the "View Callstack" of an operator, the operators with same name but different call stacks will be shown. -# Then click a "View Callstack" in this sub-table, the call stack frames will be shown. 
+# Click the ``View Callstack`` of an operator, and the operators with the same name but different call stacks will be shown.
+# Then click a ``View Callstack`` in this sub-table, and the call stack frames will be shown.
 #
 # .. image:: ../../_static/img/profiler_callstack.png
 #    :scale: 25 %
 #
-# If the TensorBoard is launched inside VSCode
+# If the TensorBoard is launched inside VS Code
 # (`Launch Guide `_),
 # clicking a call stack frame will navigate to the specific code line.
 #
@@ -279,8 +279,8 @@ def train(data):
 # 5. Improve performance with the help of profiler
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
-# At the bottom of "Overview" page, the suggestion in "Performance Recommendation" hints the bottleneck is DataLoader.
-# The PyTorch DataLoader uses single process by default.
+# At the bottom of the "Overview" page, the suggestion in "Performance Recommendation" hints that the bottleneck is the ``DataLoader``.
+# The PyTorch ``DataLoader`` uses a single process by default.
 # User could enable multi-process data loading by setting the parameter ``num_workers``.
 # `Here `_ is more details.
 #
@@ -350,7 +350,7 @@ def train(data):
 # In the memory events table, the allocation and release events are paired into one entry. The "operator" column shows
 # the immediate ATen operator that is causing the allocation. Notice that in PyTorch, ATen operators commonly use
 # ``aten::empty`` to allocate memory. For example, ``aten::ones`` is implemented as ``aten::empty`` followed by an
-# ``aten::fill_``. Solely display the opeartor name as ``aten::empty`` is of little help. It will be shown as
+# ``aten::fill_``. Solely displaying the operator name as ``aten::empty`` is of little help. It will be shown as
 # ``aten::ones (aten::empty)`` in this special case. The "Allocation Time", "Release Time" and "Duration"
 # columns' data might be missing if the event occurs outside of the time range.
 #
diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
index ad1c5d41be9..fcea4ed6611 100644
--- a/intermediate_source/torch_compile_tutorial.py
+++ b/intermediate_source/torch_compile_tutorial.py
@@ -41,7 +41,7 @@
 # Basic Usage
 # ------------
 #
-# ``torch.compile`` is included in the latest PyTorch nightlies.
+# ``torch.compile`` is included in the latest PyTorch.
 # Running TorchInductor on GPU requires Triton, which is included with the PyTorch 2.0 nightly
 # binary. If Triton is still missing, try installing ``torchtriton`` via pip
 # (``pip install torchtriton --extra-index-url "https://download.pytorch.org/whl/nightly/cu117"``
@@ -125,7 +125,7 @@ def init_model():
 # First, let's compare inference.
 #
 # Note that in the call to ``torch.compile``, we have have the additional
-# ``mode`` kwarg, which we will discuss below.
+# ``mode`` argument, which we will discuss below.
 
 def evaluate(mod, inp):
     return mod(inp)
@@ -184,7 +184,7 @@ def evaluate(mod, inp):
 # GPU compute and the observed speedup may be less significant.
 #
 # You may also see different speedup results depending on the chosen ``mode``
-# kwarg. Since our model and data are small, we want to reduce overhead as
+# argument. Since our model and data are small, we want to reduce overhead as
 # much as possible, and so we chose ``"reduce-overhead"``. For your own models,
 # you may need to experiment with different modes to maximize speedup. You can
 # read more about modes `here `__. 
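Since the hunks above standardize on calling ``mode`` an argument rather than a kwarg, a short sketch of how that argument is actually passed may help. The toy model below is illustrative only, and the comments paraphrase the tutorial's pointers rather than report benchmark results:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))

# Default mode: a balance between compile time and runtime speedup.
compiled_default = torch.compile(model)
# "reduce-overhead": targets small models and batches where framework overhead
# dominates, trading some extra memory for lower per-call overhead.
compiled_fast = torch.compile(model, mode="reduce-overhead")
# "max-autotune": spends longer compiling while searching for faster kernels.
compiled_tuned = torch.compile(model, mode="max-autotune")

x = torch.randn(32, 64)
out = compiled_fast(x)  # the first call triggers compilation; later calls reuse it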
From 836a537b2a7eae38b0c272772838ed42ce99744e Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 15:30:40 -0700 Subject: [PATCH 4/7] Update --- .pyspelling.yml | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index bce94f383cc..ad1c666d89d 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -4,34 +4,6 @@ matrix: sources: - beginner_source/*.py - intermediate_source/*.py - #- intermediate_source/autograd_saved_tensors_hooks_tutorial.py - #- intermediate_source/ax_multiobjective_nas_tutorial.py - #- intermediate_source/char_rnn_classification_tutorial.py - #- intermediate_source/char_rnn_generation_tutorial.py - #- intermediate_source/custom_function_conv_bn_tutorial.py - #- intermediate_source/ensembling.py - #- intermediate_source/flask_rest_api_tutorial.py - #- intermediate_source/forward_ad_usage.py - #- intermediate_source/fx_conv_bn_fuser.py - #- intermediate_source/fx_profiling_tutorial.py - #- intermediate_source/jacobians_hessians.py - #- intermediate_source/mario_rl_tutorial.py - #- intermediate_source/mnist_train_nas.py - #- intermediate_source/memory_format_tutorial.py - #- intermediate_source/model_parallel_tutorial.py - #- intermediate_source/neural_tangent_kernels.py - #- intermediate_source/nvfuser_intro_tutorial.py - #- intermediate_source/parametrizations.py - #- intermediate_source/per_sample_grads.py - #- intermediate_source/pipeline_tutorial.py - #- intermediate_source/pruning_tutorial.py - #- intermediate_source/reinforcement_ppo.py - #- intermediate_source/reinforcement_q_learning.py - #- intermediate_source/scaled_dot_product_attention_tutorial.py - #- intermediate_source/seq2seq_translation_tutorial.py - #- intermediate_source/spatial_transformer_tutorial.py - #- intermediate_source/tensorboard_profiler_tutorial.py - #- intermediate_source/torch_compile_tutorial.py dictionary: wordlists: - en-wordlist.txt From 37b17f5c04166adbbb4a5bb17cd31a7a107a4fd0 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 15:31:48 -0700 Subject: [PATCH 5/7] Update --- .pyspelling.yml | 2 -- en-wordlist.txt | 10 +++++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index ad1c666d89d..ffe9f469d03 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -24,8 +24,6 @@ matrix: - open: '\.\. 
(raw)::.*$\n*' close: '\n' # Exclude - - open: '.*(:py:mod:).*' - close: ' ' # Exclude Python coding directives - open: '-\*- coding:' close: '\n' diff --git a/en-wordlist.txt b/en-wordlist.txt index d8ddb614157..025098fd7ee 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,8 +1,3 @@ -UI -bytecode -TorchInductor -unoptimized -TorchDynamo APIs ATen Args @@ -125,12 +120,15 @@ TPU TensorBoard TextVQA Tokenization +TorchDynamo +TorchInductor TorchMultimodal TorchRL TorchRL's TorchScript TorchX Tunable +UI Unescape VQA VS Code @@ -160,6 +158,7 @@ batchnorm's benchmarking boolean broadcasted +bytecode cardinality chatbot chatbot's @@ -351,6 +350,7 @@ uncommented unfused unimodal unnormalized +unoptimized unparametrized unpickling unpruned From a2d6d2650d368b9b8700801390937dc397633a0e Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 15:35:05 -0700 Subject: [PATCH 6/7] Update --- 1 | 341 -------------------------------------------------------------- 1 file changed, 341 deletions(-) delete mode 100644 1 diff --git a/1 b/1 deleted file mode 100644 index 996a909cd88..00000000000 --- a/1 +++ /dev/null @@ -1,341 +0,0 @@ -Andrej -Karpathy's -NanoGPT -compilable -decorrelated -DQN -deterministically -approximators -duration -CartPole -EPS -APIs -Args -Autograd -BCE -BN -BOS -Bahdanau -BatchNorm -CHW -CIFAR -CLS -CNNDM -CNNs -CPUs -CUDA -Cayley -Chatbots -Colab -Conv -ConvNet -ConvNets -DCGAN -DCGANs -DDQN -DNN -DataLoaders -DeepMind -DeiT -DenseNet -EOS -FC -FGSM -FLAVA -FX -FX's -FloydHub -FloydHub's -Frobenius -GAE -GAN -GANs -GPU's -GPUs -GRU -GRUs -GTC -GeForce -Goodfellow -Goodfellow’s -GreedySearchDecoder -HVP -Hugging Face -IMDB -ImageNet -Initializations -Iteratively -JSON -JVP -Jacobian -Kiuk -Kubernetes -Kuei -LSTM -LSTMs -LeCun -LeNet -LeakyReLU -LeakyReLUs -Lipschitz -Lua -Luong -MLP -MLPs -MNIST -Mypy -NAS -NCHW -NES -NLP -NTK -NaN -NeurIPS -NumPy -Numericalization -Numpy's -OpenAI -PPO -Plotly -Prec -Profiler -PyTorch's -RGB -RL -RNN -RNNs -RPC -RTX -Radford -ReLU -ResNet -SPD -SST2 -Sequentials -Sigmoid -SoTA -TPU -TensorBoard -TextVQA -Tokenization -TorchMultimodal -TorchRL -TorchRL's -TorchScript -TorchX -Tunable -Unescape -VQA -Wikitext -Xeon -accuracies -activations -adversarially -al -autodiff -autograd -backend -backends -backprop -backpropagate -backpropagated -backpropagates -backpropagation -batchnorm -batchnorm's -benchmarking -boolean -broadcasted -cardinality -chatbot -chatbot's -checkpointing -composable -concat -config -contrastive -conv -convolutional -cpu -csv -cuDNN -datafile -dataframe -dataloader -dataloaders -datapipes -dataset -datasets -dataset’s -deserialize -deserialized -dimensionality -dir -downsample -downsamples -embeddings -encodings -ensembling -eq -et -evaluateInput -extensibility -fastai -fbgemm -feedforward -finetune -finetuning -fp -functorch -fuser -geomean -grayscale -hardcode -helpdesk -helpdesks -hessian -hessians -hvp -hyperparameter -hyperparameters -imagenet -initializations -inlined -interpretable -io -iterable -iteratively -jacobian -jacobians -jit -jpg -kwargs -labelled -learnable -learnings -loadFilename -manualSeed -matplotlib -minibatch -minibatches -minimax -misclassification -misclassified -modularity -modularized -multihead -multimodal -multimodality -multiobjective -multiprocessed -multithreaded -namespace -natively -ndarrays -num -numericalize -numpy -nvFuser -nvFuser's -optimizable -optimizer's -optimizers -overfitting -parallelizable -parallelization -parametrization -parametrizations -parametrized 
-parametrizing -perceptibility -pipelining -pointwise -precompute -precomputing -prepend -preprocess -preprocessing -prepruned -prespecified -pretrained -prewritten -primals -profiler -profilers -pytorch -quantized -quantizing -queryable -randint -readably -recomputation -reimplement -reimplementing -reimplements -reinitializes -relu -reproducibility -rescale -resnet -restride -rewinded -rollout -romanized -runnable -runtime -runtime -runtimes -scalable -softmax -sparsify -specificities -src -stacktrace -stateful -storages -strided -subclasses -subclassing -subdirectories -submodule -submodules -subnetworks -subreddit -summarization -tanh -th -thresholding -timestep -timesteps -tokenization -tokenize -tokenizer -topologies -torchaudio -torchdata -torchscriptable -torchtext -torchtext's -torchvision -torchviz -traceback -tradeoff -tradeoffs -uncomment -uncommented -unfused -unimodal -unnormalized -unparametrized -unpickling -unpruned -updation -utils -vectorization -vectorize -vectorized -vhp -voc -walkthrough -warmstart -warmstarting From 16d01dc2505caeaba60c76fefa1e7d347c7ef2c6 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 20 Apr 2023 16:15:01 -0700 Subject: [PATCH 7/7] Fix --- intermediate_source/pipeline_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py index 444eddf8415..33561f60592 100644 --- a/intermediate_source/pipeline_tutorial.py +++ b/intermediate_source/pipeline_tutorial.py @@ -174,7 +174,7 @@ def data_process(raw_text_iter): def batchify(data, bsz): # Divide the dataset into ``bsz`` parts. - nbatch = data.size(0) // ``bsz`` + nbatch = data.size(0) // bsz # Trim off any extra elements that wouldn't cleanly fit (remainders). data = data.narrow(0, 0, nbatch * bsz) # Evenly divide the data across the ``bsz` batches.
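For completeness, here is a self-contained sketch of the ``batchify`` helper that the final hunk repairs, exercised on a tiny illustrative input. The trailing reshape follows the standard language-modeling ``batchify`` pattern and is an assumption here, since the hunk ends before the function's return; the toy tensor is not from the tutorial:

import torch

def batchify(data, bsz):
    # Divide the dataset into ``bsz`` parts.
    nbatch = data.size(0) // bsz  # plain integer division, as fixed in the hunk above
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the ``bsz`` batches.
    return data.view(bsz, -1).t().contiguous()

tokens = torch.arange(26)          # toy stream of 26 token ids
batches = batchify(tokens, bsz=4)  # shape (6, 4); the last 2 tokens are trimmed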