From 1ceb5e6915d265018003f115118ecbc68a59a920 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 17 Apr 2023 16:09:04 -0700 Subject: [PATCH 1/3] Pyspelling: Python intermediate tutorials A-M --- .pyspelling.yml | 15 ++++ en-wordlist.txt | 81 +++++++++++++++++++ .../autograd_saved_tensors_hooks_tutorial.py | 28 ++++--- .../ax_multiobjective_nas_tutorial.py | 46 +++++------ .../char_rnn_classification_tutorial.py | 10 +-- .../char_rnn_generation_tutorial.py | 6 +- .../custom_function_conv_bn_tutorial.py | 58 ++++++------- intermediate_source/ensembling.py | 30 +++---- .../flask_rest_api_tutorial.py | 4 +- intermediate_source/forward_ad_usage.py | 28 +++---- intermediate_source/fx_conv_bn_fuser.py | 8 +- intermediate_source/fx_profiling_tutorial.py | 10 +-- intermediate_source/jacobians_hessians.py | 58 ++++++------- intermediate_source/mario_rl_tutorial.py | 21 +++-- intermediate_source/memory_format_tutorial.py | 46 +++++------ intermediate_source/mnist_train_nas.py | 2 +- .../model_parallel_tutorial.py | 6 +- 17 files changed, 276 insertions(+), 181 deletions(-) diff --git a/.pyspelling.yml b/.pyspelling.yml index 015ac975b7f..9dce7c8215a 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -3,6 +3,21 @@ matrix: - name: python sources: - beginner_source/*.py + - intermediate_source/autograd_saved_tensors_hooks_tutorial.py + - intermediate_source/ax_multiobjective_nas_tutorial.py + - intermediate_source/char_rnn_classification_tutorial.py + - intermediate_source/char_rnn_generation_tutorial.py + - intermediate_source/custom_function_conv_bn_tutorial.py + - intermediate_source/ensembling.py + #- intermediate_source/flask_rest_api_tutorial.py + - intermediate_source/forward_ad_usage.py + - intermediate_source/fx_conv_bn_fuser.py + - intermediate_source/fx_profiling_tutorial.py + - intermediate_source/jacobians_hessians.py + - intermediate_source/mario_rl_tutorial.py + - intermediate_source/mnist_train_nas.py + - intermediate_source/memory_format_tutorial.py + - intermediate_source/model_parallel_tutorial.py dictionary: wordlists: - en-wordlist.txt diff --git a/en-wordlist.txt b/en-wordlist.txt index c1447668122..7fd34f1ee56 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -2,6 +2,7 @@ APIs Args Autograd BCE +BN BOS Bahdanau BatchNorm @@ -10,18 +11,26 @@ CIFAR CLS CNNDM CNNs +CPUs CUDA Chatbots Colab Conv ConvNet +ConvNets DCGAN DCGANs +DDQN +DNN DataLoaders DeiT +DenseNet EOS +FC FGSM FLAVA +FX +FX's FloydHub FloydHub's GAN @@ -29,57 +38,85 @@ GANs GPUs GRU GRUs +GeForce Goodfellow Goodfellow’s GreedySearchDecoder +HVP Hugging Face IMDB ImageNet Initializations Iteratively JSON +JVP +Jacobian +Kiuk +Kubernetes Kuei LSTM +LSTMs LeNet LeakyReLU LeakyReLUs +Lua Luong MLP +MLPs MNIST Mypy +NAS +NCHW +NES NLP NaN NeurIPS NumPy Numericalization Numpy's +OpenAI +Plotly +Prec Profiler PyTorch's RGB +RL RNN RNNs +RTX Radford ReLU +ResNet SST2 +Sequentials Sigmoid SoTA +TPU TensorBoard TextVQA Tokenization TorchMultimodal TorchScript +TorchX +Tunable Unescape VQA Wikitext +Xeon accuracies activations adversarially al +autodiff +autograd backend +backends backprop +backpropagate backpropagated backpropagates backpropagation +batchnorm batchnorm's benchmarking boolean @@ -89,12 +126,15 @@ chatbot's checkpointing composable concat +config contrastive conv convolutional cpu csv +cuDNN datafile +dataframe dataloader dataloaders datapipes @@ -105,26 +145,43 @@ deserialize deserialized dir downsample +downsamples embeddings encodings +ensembling eq et evaluateInput +extensibility fastai 
fbgemm feedforward finetune finetuning +fp +functorch +fuser +grayscale +hardcode helpdesk helpdesks +hessian +hessians +hvp hyperparameter hyperparameters imagenet +initializations +inlined +interpretable io iterable iteratively +jacobian +jacobians jit jpg +kwargs labelled learnable loadFilename @@ -139,6 +196,7 @@ modularity modularized multimodal multimodality +multiobjective multithreaded namespace natively @@ -153,26 +211,37 @@ overfitting parallelizable parallelization perceptibility +pipelining +pointwise +precomputing prepend preprocess preprocessing +prespecified pretrained prewritten +primals profiler profilers pytorch quantized quantizing +queryable randint readably reinitializes relu reproducibility rescale +resnet +restride rewinded +romanized +runnable runtime runtime runtimes +scalable softmax src stacktrace @@ -180,29 +249,41 @@ stateful storages strided subclasses +subclassing subdirectories submodule +subreddit summarization tanh th thresholding +timestep +timesteps tokenization tokenize tokenizer torchaudio torchdata +torchscriptable torchtext torchtext's torchvision +torchviz traceback tradeoff +tradeoffs uncomment uncommented +unfused unimodal unnormalized unpickling +updation utils +vectorization +vectorize vectorized +vhp voc walkthrough warmstart diff --git a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py index f549301626d..f16b170ee6a 100644 --- a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py +++ b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py @@ -1,6 +1,6 @@ """ Hooks for autograd saved tensors -======================= +================================ """ @@ -13,8 +13,7 @@ # packing/unpacking process. # # This tutorial assumes you are familiar with how backpropagation works in -# theory. If not, read this first: -# https://colab.research.google.com/drive/1aWNdmYt7RcHMbUk-Xz2Cv5-cGFSWPXe0#scrollTo=AHcEJ6nXUb7W +# theory. If not, read `this `_ first. # @@ -107,7 +106,7 @@ def f(x): ###################################################################### # In the example above, executing without grad would only have kept ``x`` -# and ``y`` in the scope, But the graph additionnally stores ``f(x)`` and +# and ``y`` in the scope, But the graph additionally stores ``f(x)`` and # ``f(f(x))``. Hence, running a forward pass during training will be more # costly in memory usage than during evaluation (more precisely, when # autograd is not required). @@ -182,7 +181,7 @@ def unpack_hook(x): ###################################################################### -# The ``pack_hook`` function will be called everytime an operation saves +# The ``pack_hook`` function will be called every time an operation saves # a tensor for backward. # The output of ``pack_hook`` is then stored in the computation graph # instead of the original tensor. 
@@ -218,8 +217,9 @@ def unpack_hook(x):
 #
 
 ######################################################################
-# **Returning and int**
-
+# Returning an ``int``
+# ^^^^^^^^^^^^^^^^^^^^
+#
 # Returning the index of a Python list
 # Relatively harmless but with debatable usefulness
 
@@ -240,8 +240,9 @@ def unpack(x):
     assert(x.grad.equal(2 * x))
 
 ######################################################################
-# **Returning a tuple**
-
+# Returning a tuple
+# ^^^^^^^^^^^^^^^^^
+#
 # Returning some tensor and a function how to unpack it
 # Quite unlikely to be useful in its current form
 
@@ -262,9 +263,10 @@ def unpack(packed):
     assert(torch.allclose(x.grad, 2 * x))
 
 ######################################################################
-# **Returning a str**
-
-# Returning the __repr__ of the tensor
+# Returning a ``str``
+# ^^^^^^^^^^^^^^^^^^^
+#
+# Returning the ``__repr__`` of the tensor
 # Probably never do this
 
 x = torch.randn(5, requires_grad=True)
@@ -337,7 +339,7 @@ def forward(self, x):
 
 
 ######################################################################
-# In practice, on a A100 GPU, for a resnet-152 with batch size 256, this
+# In practice, on an A100 GPU, for a ResNet-152 with batch size 256, this
 # corresponds to a GPU memory usage reduction from 48GB to 5GB, at the
 # cost of a 6x slowdown.
 #
diff --git a/intermediate_source/ax_multiobjective_nas_tutorial.py b/intermediate_source/ax_multiobjective_nas_tutorial.py
index 7c43f59473c..79b096b9e64 100644
--- a/intermediate_source/ax_multiobjective_nas_tutorial.py
+++ b/intermediate_source/ax_multiobjective_nas_tutorial.py
@@ -48,7 +48,7 @@
 # Our goal is to optimize the PyTorch Lightning training job defined in
 # `mnist_train_nas.py `__.
 # To do this using TorchX, we write a helper function that takes in
-# the values of the architcture and hyperparameters of the training
+# the values of the architecture and hyperparameters of the training
 # job and creates a `TorchX AppDef `__
 # with the appropriate settings.
 #
@@ -72,12 +72,12 @@ def trainer(
     trial_idx: int = -1,
 ) -> specs.AppDef:
 
-    # define the log path so we can pass it to the TorchX AppDef
+    # define the log path so we can pass it to the TorchX ``AppDef``
     if trial_idx >= 0:
         log_path = Path(log_path).joinpath(str(trial_idx)).absolute().as_posix()
 
     return utils.python(
-        # command line args to the training script
+        # command line arguments to the training script
         "--log_path",
         log_path,
         "--hidden_size_1",
@@ -126,15 +126,15 @@ def trainer(
     tracker_base="/tmp/",
     component=trainer,
     # NOTE: To launch this job on a cluster instead of locally you can
-    # specify a different scheduler and adjust args appropriately.
+    # specify a different scheduler and adjust arguments appropriately.
     scheduler="local_cwd",
     component_const_params={"log_path": log_dir},
     cfg={},
 )
 
 ######################################################################
-# Setting up the SearchSpace
-# --------------------------
+# Setting up the ``SearchSpace``
+# ------------------------------
 #
 # First, we define our search space. Ax supports both range parameters
 # of type integer and float as well as choice parameters which can have
@@ -154,7 +154,7 @@ def trainer(
 parameters = [
     # NOTE: In a real-world setting, hidden_size_1 and hidden_size_2
     # should probably be powers of 2, but in our simple example this
-    # would mean that num_params can't take on that many values, which
+    # would mean that ``num_params`` can't take on that many values, which
     # in turn makes the Pareto frontier look pretty weird.
RangeParameter( name="hidden_size_1", @@ -189,7 +189,7 @@ def trainer( upper=0.5, parameter_type=ParameterType.FLOAT, ), - ChoiceParameter( # NOTE: ChoiceParameters don't require log-scale + ChoiceParameter( # NOTE: ``ChoiceParameters`` don't require log-scale name="batch_size", values=[32, 64, 128, 256], parameter_type=ParameterType.INT, @@ -212,7 +212,7 @@ def trainer( # # Ax has the concept of a `Metric `__ # that defines properties of outcomes and how observations are obtained -# for these outcomes. This allows e.g. encodig how data is fetched from +# for these outcomes. This allows e.g. encoding how data is fetched from # some distributed execution backend and post-processed before being # passed as input to Ax. # @@ -229,7 +229,7 @@ def trainer( # index (see the ``trainer()`` function above). We will define a metric # class that is aware of that logging directory. By subclassing # `TensorboardCurveMetric `__ -# we get the logic to read and parse the Tensorboard logs for free. +# we get the logic to read and parse the TensorBoard logs for free. # from ax.metrics.tensorboard import TensorboardCurveMetric @@ -237,10 +237,10 @@ def trainer( class MyTensorboardMetric(TensorboardCurveMetric): - # NOTE: We need to tell the new Tensorboard metric how to get the id / - # file handle for the tensorboard logs from a trial. In this case + # NOTE: We need to tell the new TensorBoard metric how to get the id / + # file handle for the TensorBoard logs from a trial. In this case # our convention is to just save a separate file per trial in - # the pre-specified log dir. + # the prespecified log dir. @classmethod def get_ids_from_trials(cls, trials): return { @@ -257,9 +257,9 @@ def is_available_while_running(cls): ###################################################################### -# Now we can instatiate the metrics for accuracy and the number of +# Now we can instantiate the metrics for accuracy and the number of # model parameters. Here `curve_name` is the name of the metric in the -# Tensorboard logs, while `name` is the metric name used internally +# TensorBoard logs, while `name` is the metric name used internally # by Ax. We also specify `lower_is_better` to indicate the favorable # direction of the two metrics. # @@ -277,8 +277,8 @@ def is_available_while_running(cls): ###################################################################### -# Setting up the OptimizationConfig -# ---------------------------------- +# Setting up the ``OptimizationConfig`` +# ------------------------------------- # # The way to tell Ax what it should optimize is by means of an # `OptimizationConfig `__. @@ -335,8 +335,8 @@ def is_available_while_running(cls): ) ###################################################################### -# Choosing the GenerationStrategy -# ------------------------------- +# Choosing the Generation Strategy +# -------------------------------- # # A `GenerationStrategy `__ # is the abstract representation of how we would like to perform the @@ -366,7 +366,7 @@ def is_available_while_running(cls): # Configuring the Scheduler # ------------------------- # -# The `Scheduler` (TODO: link) acts as the loop control for the optimization. +# The ``Scheduler`` acts as the loop control for the optimization. # It communicates with the backend to launch trials, check their status, # and retrieve results. In the case of this tutorial, it is simply reading # and parsing the locally saved logs. 
In a remote execution setting, @@ -404,7 +404,7 @@ def is_available_while_running(cls): # ------------------------ # # Now that everything is configured, we can let Ax run the optimization -# in a fully automated fashion. The Scheduler will periodially check +# in a fully automated fashion. The Scheduler will periodically check # the logs for the status of all currently running trials, and if a # trial completes the scheduler will update its status on the # experiment and fetch the observations needed for the Bayesian @@ -479,7 +479,7 @@ def is_available_while_running(cls): from ax.plot.diagnostic import interact_cross_validation_plotly from ax.utils.notebook.plotting import init_notebook_plotting, render -cv = cross_validate(model=gs.model) # The surrogate model is stored on the GenerationStrategy +cv = cross_validate(model=gs.model) # The surrogate model is stored on the ``GenerationStrategy`` compute_diagnostics(cv) interact_cross_validation_plotly(cv) @@ -508,7 +508,7 @@ def is_available_while_running(cls): ###################################################################### -# Acknowledgements +# Acknowledgments # ---------------- # # We thank the TorchX team (in particular Kiuk Chung and Tristan Rice) diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py index 78cbc111151..f36b92fb17e 100644 --- a/intermediate_source/char_rnn_classification_tutorial.py +++ b/intermediate_source/char_rnn_classification_tutorial.py @@ -61,7 +61,7 @@ and extract it to the current directory. Included in the ``data/names`` directory are 18 text files named as -"[Language].txt". Each file contains a bunch of names, one name per +``[Language].txt``. Each file contains a bunch of names, one name per line, mostly romanized (but we still need to convert from Unicode to ASCII). @@ -179,7 +179,7 @@ def lineToTensor(line): # tutorial `__) # is just 2 linear layers which operate on an input and hidden state, with -# a LogSoftmax layer after the output. +# a ``LogSoftmax`` layer after the output. # # .. figure:: https://i.imgur.com/Z2xbySO.png # :alt: @@ -230,7 +230,7 @@ def initHidden(self): # For the sake of efficiency we don't want to be creating a new Tensor for # every step, so we will use ``lineToTensor`` instead of # ``letterToTensor`` and use slices. This could be further optimized by -# pre-computing batches of Tensors. +# precomputing batches of Tensors. # input = lineToTensor('Albert') @@ -372,7 +372,7 @@ def timeSince(since): output, loss = train(category_tensor, line_tensor) current_loss += loss - # Print iter number, loss, name and guess + # Print ``iter`` number, loss, name and guess if iter % print_every == 0: guess, guess_i = categoryFromOutput(output) correct = '✓' if guess == category else '✗ (%s)' % category @@ -495,7 +495,7 @@ def predict(input_line, n_predictions=3): # - ``model.py`` (defines the RNN) # - ``train.py`` (runs training) # - ``predict.py`` (runs ``predict()`` with command line arguments) -# - ``server.py`` (serve prediction as a JSON API with bottle.py) +# - ``server.py`` (serve prediction as a JSON API with ``bottle.py``) # # Run ``train.py`` to train and save the network. 
# diff --git a/intermediate_source/char_rnn_generation_tutorial.py b/intermediate_source/char_rnn_generation_tutorial.py index ee7b0d14fd3..431c2bf43d9 100644 --- a/intermediate_source/char_rnn_generation_tutorial.py +++ b/intermediate_source/char_rnn_generation_tutorial.py @@ -234,7 +234,7 @@ def inputTensor(line): tensor[li][0][all_letters.find(letter)] = 1 return tensor -# LongTensor of second letter to end (EOS) for target +# ``LongTensor`` of second letter to end (EOS) for target def targetTensor(line): letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))] letter_indexes.append(n_letters - 1) # EOS @@ -322,7 +322,7 @@ def timeSince(since): print_every = 5000 plot_every = 500 all_losses = [] -total_loss = 0 # Reset every plot_every iters +total_loss = 0 # Reset every ``plot_every`` ``iters`` start = time.time() @@ -429,6 +429,6 @@ def samples(category, start_letters='ABC'): # choosing a start letter # - Get better results with a bigger and/or better shaped network # -# - Try the nn.LSTM and nn.GRU layers +# - Try the ``nn.LSTM`` and ``nn.GRU`` layers # - Combine multiple of these RNNs as a higher level network # diff --git a/intermediate_source/custom_function_conv_bn_tutorial.py b/intermediate_source/custom_function_conv_bn_tutorial.py index 8838fc0d32a..a9fcd8838ae 100644 --- a/intermediate_source/custom_function_conv_bn_tutorial.py +++ b/intermediate_source/custom_function_conv_bn_tutorial.py @@ -35,7 +35,7 @@ For simplicity, in this tutorial we hardcode `bias=False`, `stride=1`, `padding=0`, `dilation=1`, and `groups=1` for Conv2D. For BatchNorm2D, we hardcode `eps=1e-3`, `momentum=0.1`, `affine=False`, and `track_running_statistics=False`. Another small difference -is that we add epsilon in the denomator outside of the square root in the computation +is that we add epsilon in the denominator outside of the square root in the computation of batch norm. [0] https://nenadmarkus.com/p/fusing-batchnorm-and-conv/ @@ -72,7 +72,7 @@ def backward(ctx, grad_out): return convolution_backward(grad_out, X, weight) ###################################################################### -# When testing with gradcheck, it is important to use double precision +# When testing with ``gradcheck``, it is important to use double precision weight = torch.rand(5, 3, 3, 3, requires_grad=True, dtype=torch.double) X = torch.rand(10, 3, 7, 7, requires_grad=True, dtype=torch.double) torch.autograd.gradcheck(Conv2D.apply, (X, weight)) @@ -80,38 +80,38 @@ def backward(ctx, grad_out): ###################################################################### # Backward Formula Implementation for Batch Norm # ------------------------------------------------------------------- -# Batch Norm has two modes: training and eval mode. In training mode -# the sample statistics are a function of the inputs. In eval mode, +# Batch Norm has two modes: training and ``eval`` mode. In training mode +# the sample statistics are a function of the inputs. In ``eval`` mode, # we use the saved running statistics, which are not a function of the inputs. # This makes non-training mode's backward significantly simpler. Below # we implement and test only the training mode case. 
def unsqueeze_all(t): - # Helper function to unsqueeze all the dimensions that we reduce over + # Helper function to ``unsqueeze`` all the dimensions that we reduce over return t[None, :, None, None] def batch_norm_backward(grad_out, X, sum, sqrt_var, N, eps): - # We use the formula: out = (X - mean(X)) / (sqrt(var(X)) + eps) - # in batch norm 2d's forward. To simplify our derivation, we follow the + # We use the formula: ``out = (X - mean(X)) / (sqrt(var(X)) + eps)`` + # in batch norm 2D forward. To simplify our derivation, we follow the # chain rule and compute the gradients as follows before accumulating # them all into a final grad_input. - # 1) 'grad of out wrt var(X)' * 'grad of var(X) wrt X' - # 2) 'grad of out wrt mean(X)' * 'grad of mean(X) wrt X' - # 3) 'grad of out wrt X in the numerator' * 'grad of X wrt X' + # 1) ``grad of out wrt var(X)`` * ``grad of var(X) wrt X`` + # 2) ``grad of out wrt mean(X)`` * ``grad of mean(X) wrt X`` + # 3) ``grad of out wrt X in the numerator`` * ``grad of X wrt X`` # We then rewrite the formulas to use as few extra buffers as possible tmp = ((X - unsqueeze_all(sum) / N) * grad_out).sum(dim=(0, 2, 3)) tmp *= -1 - d_denom = tmp / (sqrt_var + eps)**2 # d_denom = -num / denom**2 - # It is useful to delete tensors when you no longer need them with `del` - # For example, we could've done `del tmp` here because we won't use it later - # In this case, it's not a big difference because tmp only has size of (C,) + d_denom = tmp / (sqrt_var + eps)**2 # ``d_denom = -num / denom**2`` + # It is useful to delete tensors when you no longer need them with ``del`` + # For example, we could've done ``del tmp`` here because we won't use it later + # In this case, it's not a big difference because ``tmp`` only has size of (C,) # The important thing is avoid allocating NCHW-sized tensors unnecessarily - d_var = d_denom / (2 * sqrt_var) # denom = torch.sqrt(var) + eps - # Compute d_mean_dx before allocating the final NCHW-sized grad_input buffer + d_var = d_denom / (2 * sqrt_var) # ``denom = torch.sqrt(var) + eps`` + # Compute ``d_mean_dx`` before allocating the final NCHW-sized grad_input buffer d_mean_dx = grad_out / unsqueeze_all(sqrt_var + eps) d_mean_dx = unsqueeze_all(-d_mean_dx.sum(dim=(0, 2, 3)) / N) - # d_mean_dx has already been reassigned to a C-sized buffer so no need to worry + # ``d_mean_dx`` has already been reassigned to a C-sized buffer so no need to worry - # (1) unbiased_var(x) = ((X - unsqueeze_all(mean))**2).sum(dim=(0, 2, 3)) / (N - 1) + # ``(1) unbiased_var(x) = ((X - unsqueeze_all(mean))**2).sum(dim=(0, 2, 3)) / (N - 1)`` grad_input = X * unsqueeze_all(d_var * N) grad_input += unsqueeze_all(-d_var * sum) grad_input *= 2 / ((N - 1) * N) @@ -120,13 +120,13 @@ def batch_norm_backward(grad_out, X, sum, sqrt_var, N, eps): # (3) Add 'grad_out / ' without allocating an extra buffer grad_input *= unsqueeze_all(sqrt_var + eps) grad_input += grad_out - grad_input /= unsqueeze_all(sqrt_var + eps) # sqrt_var + eps > 0! 
+ grad_input /= unsqueeze_all(sqrt_var + eps) # ``sqrt_var + eps > 0!`` return grad_input class BatchNorm(torch.autograd.Function): @staticmethod def forward(ctx, X, eps=1e-3): - # Don't save keepdim'd values for backward + # Don't save ``keepdim`` values for backward sum = X.sum(dim=(0, 2, 3)) var = X.var(unbiased=True, dim=(0, 2, 3)) N = X.numel() / X.size(1) @@ -149,7 +149,7 @@ def backward(ctx, grad_out): return batch_norm_backward(grad_out, X, ctx.sum, ctx.sqrt_var, ctx.N, ctx.eps) ###################################################################### -# Testing with gradcheck +# Testing with ``gradcheck`` a = torch.rand(1, 2, 3, 4, requires_grad=True, dtype=torch.double) torch.autograd.gradcheck(BatchNorm.apply, (a,), fast_mode=False) @@ -228,7 +228,7 @@ def reset_parameters(self) -> None: nn.init.kaiming_uniform_(self.conv_weight, a=math.sqrt(5)) ###################################################################### -# Use gradcheck to validate the correctness of our backward formula +# Use ``gradcheck`` to validate the correctness of our backward formula weight = torch.rand(5, 3, 3, 3, requires_grad=True, dtype=torch.double) X = torch.rand(2, 3, 4, 4, requires_grad=True, dtype=torch.double) torch.autograd.gradcheck(FusedConvBN2DFunction.apply, (X, weight)) @@ -236,7 +236,7 @@ def reset_parameters(self) -> None: ###################################################################### # Testing out our new Layer # ------------------------------------------------------------------- -# Use FusedConvBN to train a basic network +# Use ``FusedConvBN`` to train a basic network # The code below is after some light modifications to the example here: # https://github.com/pytorch/examples/tree/master/mnist import torch.optim as optim @@ -350,20 +350,20 @@ def test(model, device, test_loader): ###################################################################### # A Comparison of Memory Usage # ------------------------------------------------------------------- -# If cuda is enabled, print out memory usage for both `fused=True` and `fused=False` -# For an example run on RTX 3070, CuDNN 8.0.5: fused peak memory: 1.56GB, +# If CUDA is enabled, print out memory usage for both `fused=True` and `fused=False` +# For an example run on NVIDIA GeForce RTX 3070, NVIDIA CUDA® Deep Neural Network library (cuDNN) 8.0.5: fused peak memory: 1.56GB, # unfused peak memory: 2.68GB # # It is important to note that the *peak* memory usage for this model may vary depending -# the specific CuDNN convolution algorithm used. For shallower models, it +# the specific cuDNN convolution algorithm used. For shallower models, it # may be possible for the peak memory allocated of the fused model to exceed # that of the unfused model! This is because the memory allocated to compute -# certain CuDNN convolution algorithms can be high enough to "hide" the typical peak +# certain cuDNN convolution algorithms can be high enough to "hide" the typical peak # you would expect to be near the start of the backward pass. # # For this reason, we also record and display the memory allocated at the end # of the forward pass as an approximation, and to demonstrate that we indeed -# allocate one fewer buffer per fused conv-bn pair. +# allocate one fewer buffer per fused ``conv-bn`` pair. 
from statistics import mean torch.backends.cudnn.enabled = True @@ -384,7 +384,7 @@ def test(model, device, test_loader): scheduler.step() peak_memory_allocated.append(torch.cuda.max_memory_allocated()) torch.cuda.reset_peak_memory_stats() - print("CuDNN version:", torch.backends.cudnn.version()) + print("cuDNN version:", torch.backends.cudnn.version()) print() print("Peak memory allocated:") print(f"fused: {peak_memory_allocated[0]/1024**3:.2f}GB, unfused: {peak_memory_allocated[1]/1024**3:.2f}GB") diff --git a/intermediate_source/ensembling.py b/intermediate_source/ensembling.py index 8b3c21e4086..8102b7bc184 100644 --- a/intermediate_source/ensembling.py +++ b/intermediate_source/ensembling.py @@ -77,15 +77,15 @@ def forward(self, x): predictions2 = [model(minibatch) for model in models] ###################################################################### -# Using vmap to vectorize the ensemble +# Using ``vmap`` to vectorize the ensemble # ------------------------------------ # -# Let's use vmap to speed up the for-loop. We must first prepare the models -# for use with vmap. +# Let's use ``vmap`` to speed up the for-loop. We must first prepare the models +# for use with ``vmap``. # # First, let’s combine the states of the model together by stacking each # parameter. For example, ``model[i].fc1.weight`` has shape ``[784, 128]``; we are -# going to stack the .fc1.weight of each of the 10 models to produce a big +# going to stack the ``.fc1.weight`` of each of the 10 models to produce a big # weight of shape ``[10, 784, 128]``. # # PyTorch offers the ``torch.func.stack_module_state`` convenience function to do @@ -95,7 +95,7 @@ def forward(self, x): params, buffers = stack_module_state(models) ###################################################################### -# Next, we need to define a function to vmap over. The function should, +# Next, we need to define a function to ``vmap`` over. The function should, # given parameters and buffers and inputs, run the model using those # parameters, buffers, and inputs. We'll use ``torch.func.functional_call`` # to help out: @@ -114,9 +114,9 @@ def fmodel(params, buffers, x): ###################################################################### # Option 1: get predictions using a different minibatch for each model. # -# By default, vmap maps a function across the first dimension of all inputs to +# By default, ``vmap`` maps a function across the first dimension of all inputs to # the passed-in function. After using ``stack_module_state``, each of -# the params and buffers have an additional dimension of size 'num_models' at +# the ``params`` and buffers have an additional dimension of size 'num_models' at # the front, and minibatches has a dimension of size 'num_models'. print([p.size(0) for p in params.values()]) # show the leading 'num_models' dimension @@ -127,14 +127,14 @@ def fmodel(params, buffers, x): predictions1_vmap = vmap(fmodel)(params, buffers, minibatches) -# verify the vmap predictions match the +# verify the ``vmap`` predictions match the assert torch.allclose(predictions1_vmap, torch.stack(predictions_diff_minibatch_loop), atol=1e-3, rtol=1e-5) ###################################################################### # Option 2: get predictions using the same minibatch of data. # -# vmap has an in_dims arg that specifies which dimensions to map over. -# By using ``None``, we tell vmap we want the same minibatch to apply for all of +# ``vmap`` has an ``in_dims`` argument that specifies which dimensions to map over. 
+# By using ``None``, we tell ``vmap`` we want the same minibatch to apply for all of # the 10 models. predictions2_vmap = vmap(fmodel, in_dims=(0, 0, None))(params, buffers, minibatch) @@ -143,9 +143,9 @@ def fmodel(params, buffers, x): ###################################################################### # A quick note: there are limitations around what types of functions can be -# transformed by vmap. The best functions to transform are ones that are pure +# transformed by ``vmap``. The best functions to transform are ones that are pure # functions: a function where the outputs are only determined by the inputs -# that have no side effects (e.g. mutation). vmap is unable to handle mutation +# that have no side effects (e.g. mutation). ``vmap`` is unable to handle mutation # of arbitrary Python data structures, but it is able to handle many in-place # PyTorch operations. @@ -165,11 +165,11 @@ def fmodel(params, buffers, x): print(f'Predictions with vmap {with_vmap.timeit(100)}') ###################################################################### -# There's a large speedup using vmap! +# There's a large speedup using ``vmap``! # -# In general, vectorization with vmap should be faster than running a function +# In general, vectorization with ``vmap`` should be faster than running a function # in a for-loop and competitive with manual batching. There are some exceptions -# though, like if we haven’t implemented the vmap rule for a particular +# though, like if we haven’t implemented the ``vmap`` rule for a particular # operation or if the underlying kernels weren’t optimized for older hardware # (GPUs). If you see any of these cases, please let us know by opening an issue # on GitHub. diff --git a/intermediate_source/flask_rest_api_tutorial.py b/intermediate_source/flask_rest_api_tutorial.py index 39c1a9d39f7..690fa975a5c 100644 --- a/intermediate_source/flask_rest_api_tutorial.py +++ b/intermediate_source/flask_rest_api_tutorial.py @@ -53,7 +53,7 @@ # Simple Web Server # ----------------- # -# Following is a simple webserver, taken from Flask's documentation +# Following is a simple web server, taken from Flask's documentation from flask import Flask @@ -114,7 +114,7 @@ def predict(): # ~~~~~~~~~~~~~~~~~~~ # # DenseNet model requires the image to be of 3 channel RGB image of size -# 224 x 224. We will also normalise the image tensor with the required mean +# 224 x 224. We will also normalize the image tensor with the required mean # and standard deviation values. You can read more about it # `here `_. # diff --git a/intermediate_source/forward_ad_usage.py b/intermediate_source/forward_ad_usage.py index ef194d65023..10965d64ab9 100644 --- a/intermediate_source/forward_ad_usage.py +++ b/intermediate_source/forward_ad_usage.py @@ -25,7 +25,7 @@ to dual numbers[0]. As the forward pass is performed, if any input tensors are dual tensors, -extra computation is performed to propogate this "sensitivity" of the +extra computation is performed to propagate this "sensitivity" of the function. 
""" @@ -68,7 +68,7 @@ def fn(x, y): plain_tensor = torch.randn(10, 10) dual_output = fn(dual_input, plain_tensor) - # Unpacking the dual returns a namedtuple with ``primal`` and ``tangent`` + # Unpacking the dual returns a ``namedtuple`` with ``primal`` and ``tangent`` # as attributes jvp = fwAD.unpack_dual(dual_output).tangent @@ -136,7 +136,7 @@ class Fn(torch.autograd.Function): @staticmethod def forward(ctx, foo): result = torch.exp(foo) - # Tensors stored in ctx can be used in the subsequent forward grad + # Tensors stored in ``ctx`` can be used in the subsequent forward grad # computation. ctx.result = result return result @@ -144,7 +144,7 @@ def forward(ctx, foo): @staticmethod def jvp(ctx, gI): gO = gI * ctx.result - # If the tensor stored in ctx will not also be used in the backward pass, + # If the tensor stored in`` ctx`` will not also be used in the backward pass, # one can manually free it using ``del`` del ctx.result return gO @@ -161,9 +161,9 @@ def jvp(ctx, gI): # It is important to use ``autograd.gradcheck`` to verify that your # custom autograd Function computes the gradients correctly. By default, -# gradcheck only checks the backward-mode (reverse-mode) AD gradients. Specify +# ``gradcheck`` only checks the backward-mode (reverse-mode) AD gradients. Specify # ``check_forward_ad=True`` to also check forward grads. If you did not -# implement the backward formula for your function, you can also tell gradcheck +# implement the backward formula for your function, you can also tell ``gradcheck`` # to skip the tests that require backward-mode AD by specifying # ``check_backward_ad=False``, ``check_undefined_grad=False``, and # ``check_batched_grad=False``. @@ -198,11 +198,11 @@ def fn(x, y): return x ** 2 + y ** 2 # Here is a basic example to compute the JVP of the above function. -# The jvp(func, primals, tangents) returns func(*primals) as well as the -# computed jvp. Each primal must be associated with a tangent of the same shape. +# The ``jvp(func, primals, tangents)`` returns ``func(*primals)`` as well as the +# computed Jacobian-vector product (JVP). Each primal must be associated with a tangent of the same shape. primal_out, tangent_out = ft.jvp(fn, (primal0, primal1), (tangent0, tangent1)) -# functorch.jvp requires every primal to be associated with a tangent. +# ``functorch.jvp`` requires every primal to be associated with a tangent. # If we only want to associate certain inputs to `fn` with tangents, # then we'll need to create a new function that captures inputs without tangents: primal = torch.randn(10, 10) @@ -216,7 +216,7 @@ def fn(x, y): ###################################################################### # Using the functional API with Modules # -------------------------------------------------------------------- -# To use ``nn.Module`` with functorch.jvp to compute Jacobian-vector products +# To use ``nn.Module`` with ``functorch.jvp`` to compute Jacobian-vector products # with respect to the model parameters, we need to reformulate the # ``nn.Module`` as a function that accepts both the model parameters and inputs # to the module. 
@@ -225,16 +225,16 @@ def fn(x, y): input = torch.randn(16, 5) tangents = tuple([torch.rand_like(p) for p in model.parameters()]) -# Given a torch.nn.Module, ft.make_functional_with_buffers extracts the state -# (params and buffers) and returns a functional version of the model that +# Given a ``torch.nn.Module``, ``ft.make_functional_with_buffers`` extracts the state +# (``params`` and buffers) and returns a functional version of the model that # can be invoked like a function. # That is, the returned ``func`` can be invoked like # ``func(params, buffers, input)``. -# ft.make_functional_with_buffers is analogous to the nn.Modules stateless API +# ``ft.make_functional_with_buffers`` is analogous to the ``nn.Modules`` stateless API # that you saw previously and we're working on consolidating the two. func, params, buffers = ft.make_functional_with_buffers(model) -# Because jvp requires every input to be associated with a tangent, we need to +# Because ``jvp`` requires every input to be associated with a tangent, we need to # create a new function that, when given the parameters, produces the output def func_params_only(params): return func(params, buffers, input) diff --git a/intermediate_source/fx_conv_bn_fuser.py b/intermediate_source/fx_conv_bn_fuser.py index c06f5f76835..90620ceba4e 100644 --- a/intermediate_source/fx_conv_bn_fuser.py +++ b/intermediate_source/fx_conv_bn_fuser.py @@ -32,7 +32,7 @@ # For this tutorial, we are going to create a model consisting of convolutions # and batch norms. Note that this model has some tricky components - some of # the conv/batch norm patterns are hidden within Sequentials and one of the -# BatchNorms is wrapped in another Module. +# ``BatchNorms`` is wrapped in another Module. class WrappedBatchNorm(nn.Module): def __init__(self): @@ -137,7 +137,7 @@ def fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): def _parent_name(target : str) -> Tuple[str, str]: """ - Splits a qualname into parent path and last atom. + Splits a ``qualname`` into parent path and last atom. For example, `foo.bar.baz` -> (`foo.bar`, `baz`) """ *parent, name = target.rsplit('.', 1) @@ -242,9 +242,9 @@ def benchmark(model, iters=20): print("Fused time: ", benchmark(fused_rn18)) ###################################################################### # As we previously saw, the output of our FX transformation is -# (Torchscriptable) PyTorch code, we can easily `jit.script` the output to try +# ("torchscriptable") PyTorch code, we can easily ``jit.script`` the output to try # and increase our performance even more. In this way, our FX model -# transformation composes with Torchscript with no issues. +# transformation composes with TorchScript with no issues. jit_rn18 = torch.jit.script(fused_rn18) print("jit time: ", benchmark(jit_rn18)) diff --git a/intermediate_source/fx_profiling_tutorial.py b/intermediate_source/fx_profiling_tutorial.py index 06726e4dd6c..18d8bc67cf4 100644 --- a/intermediate_source/fx_profiling_tutorial.py +++ b/intermediate_source/fx_profiling_tutorial.py @@ -117,7 +117,7 @@ def __init__(self, mod : torch.nn.Module): ###################################################################### # Next, let's override our first method: ``run()``. ``Interpreter``'s ``run`` - # method is the top-level entrypoint for execution of the model. We will + # method is the top-level entry point for execution of the model. We will # want to intercept this so that we can record the total runtime of the # model. 
@@ -129,7 +129,7 @@ def run(self, *args) -> Any: # Record the time we finished running the model t_end = time.time() # Store the total elapsed time this model execution took in the - # ProfilingInterpreter + # ``ProfilingInterpreter`` self.total_runtime_sec.append(t_end - t_start) return return_val @@ -176,7 +176,7 @@ def summary(self, should_sort : bool = False) -> str: # time each node took with respect to the whole network. pct_total = mean_runtime / mean_total_runtime * 100 # Record the node's type, name of the node, mean runtime, and - # percent runtim + # percent runtime. node_summaries.append( [node.op, str(node), mean_runtime, pct_total]) @@ -214,7 +214,7 @@ def summary(self, should_sort : bool = False) -> str: ###################################################################### # There are two things we should call out here: # -# * MaxPool2d takes up the most time. This is a known issue: +# * ``MaxPool2d`` takes up the most time. This is a known issue: # https://github.com/pytorch/pytorch/issues/51393 # * BatchNorm2d also takes up significant time. We can continue this # line of thinking and optimize this in the Conv-BN Fusion with FX @@ -226,7 +226,7 @@ def summary(self, should_sort : bool = False) -> str: # As we can see, using FX we can easily capture PyTorch programs (even # ones we don't have the source code for!) in a machine-interpretable # format and use that for analysis, such as the performance analysis -# we've done here. FX opens up an exiciting world of possibilities for +# we've done here. FX opens up an exciting world of possibilities for # working with PyTorch programs. # # Finally, since FX is still in beta, we would be happy to hear any diff --git a/intermediate_source/jacobians_hessians.py b/intermediate_source/jacobians_hessians.py index 3da8bda11f1..b8b96c30a3e 100644 --- a/intermediate_source/jacobians_hessians.py +++ b/intermediate_source/jacobians_hessians.py @@ -62,7 +62,7 @@ def compute_jac(xp): ###################################################################### # Instead of computing the jacobian row-by-row, we can use PyTorch's # ``torch.vmap`` function transform to get rid of the for-loop and vectorize the -# computation. We can’t directly apply vmap to ``torch.autograd.grad``; +# computation. We can’t directly apply ``vmap`` to ``torch.autograd.grad``; # instead, PyTorch provides a ``torch.func.vjp`` transform that composes with # ``torch.vmap``: @@ -76,15 +76,15 @@ def compute_jac(xp): assert torch.allclose(ft_jacobian, jacobian) ###################################################################### -# In a later tutorial a composition of reverse-mode AD and vmap will give us +# In a later tutorial a composition of reverse-mode AD and ``vmap`` will give us # per-sample-gradients. -# In this tutorial, composing reverse-mode AD and vmap gives us Jacobian +# In this tutorial, composing reverse-mode AD and ``vmap`` gives us Jacobian # computation! -# Various compositions of vmap and autodiff transforms can give us different +# Various compositions of ``vmap`` and autodiff transforms can give us different # interesting quantities. # # PyTorch provides ``torch.func.jacrev`` as a convenience function that performs -# the vmap-vjp composition to compute jacobians. ``jacrev`` accepts an argnums +# the ``vmap-vjp`` composition to compute jacobians. ``jacrev`` accepts an ``argnums`` # argument that says which argument we would like to compute Jacobians with # respect to. 
@@ -92,7 +92,7 @@ def compute_jac(xp): ft_jacobian = jacrev(predict, argnums=2)(weight, bias, x) -# confirm +# Confirm by running the following: assert torch.allclose(ft_jacobian, jacobian) ###################################################################### @@ -100,10 +100,10 @@ def compute_jac(xp): # The function transform version is much faster (and becomes even faster the # more outputs there are). # -# In general, we expect that vectorization via vmap can help eliminate overhead +# In general, we expect that vectorization via ``vmap`` can help eliminate overhead # and give better utilization of your hardware. # -# vmap does this magic by pushing the outer loop down into the function's +# ``vmap`` does this magic by pushing the outer loop down into the function's # primitive operations in order to obtain better performance. # # Let's make a quick function to evaluate performance and deal with @@ -133,34 +133,34 @@ def get_perf(first, first_descriptor, second, second_descriptor): print(with_vmap_timer) ###################################################################### -# Let's do a relative performance comparison of the above with our get_perf function: +# Let's do a relative performance comparison of the above with our ``get_perf`` function: get_perf(no_vmap_timer, "without vmap", with_vmap_timer, "vmap") ###################################################################### -# Furthemore, it’s pretty easy to flip the problem around and say we want to +# Furthermore, it’s pretty easy to flip the problem around and say we want to # compute Jacobians of the parameters to our model (weight, bias) instead of the input -# note the change in input via argnums params of 0,1 to map to weight and bias +# note the change in input via ``argnums`` parameters of 0,1 to map to weight and bias ft_jac_weight, ft_jac_bias = jacrev(predict, argnums=(0, 1))(weight, bias, x) ###################################################################### -# reverse-mode Jacobian (jacrev) vs forward-mode Jacobian (jacfwd) -# -------------------------------------------------------------------- +# Reverse-mode Jacobian (``jacrev``) vs forward-mode Jacobian (``jacfwd``) +# ------------------------------------------------------------------------ # # We offer two APIs to compute jacobians: ``jacrev`` and ``jacfwd``: # -# - jacrev uses reverse-mode AD. As you saw above it is a composition of our -# vjp and vmap transforms. -# - jacfwd uses forward-mode AD. It is implemented as a composition of our -# jvp and vmap transforms. +# - ``jacrev`` uses reverse-mode AD. As you saw above it is a composition of our +# ``vjp`` and ``vmap`` transforms. +# - ``jacfwd`` uses forward-mode AD. It is implemented as a composition of our +# ``jvp`` and ``vmap`` transforms. # -# jacfwd and jacrev can be substituted for each other but they have different +# ``jacfwd`` and ``jacrev`` can be substituted for each other but they have different # performance characteristics. # # As a general rule of thumb, if you’re computing the jacobian of an :math:`R^N \to R^M` -# function, and there are many more outputs than inputs (i.e. :math:`M > N`) then -# jacfwd is preferred, otherwise use jacrev. There are exceptions to this rule, +# function, and there are many more outputs than inputs (for example, :math:`M > N`) then +# ``jacfwd`` is preferred, otherwise use ``jacrev``. 
There are exceptions to this rule, # but a non-rigorous argument for this follows: # # In reverse-mode AD, we are computing the jacobian row-by-row, while in @@ -217,7 +217,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): print(f'jacrev time: {jacrev_timing}') ####################################################################### -# and a relative perf comparison: +# and a relative performance comparison: get_perf(jacrev_timing, "jacrev", jacfwd_timing, "jacfwd") @@ -228,7 +228,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): # Hessians are the jacobian of the jacobian (or the partial derivative of # the partial derivative, aka second order). # -# This suggests that one can just compose functorch’s jacobian transforms to +# This suggests that one can just compose functorch jacobian transforms to # compute the Hessian. # Indeed, under the hood, ``hessian(f)`` is simply ``jacfwd(jacrev(f))``. # @@ -238,7 +238,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): from torch.func import hessian -# lets reduce the size in order not to blow out colab. Hessians require +# lets reduce the size in order not to overwhelm Colab. Hessians require # significant memory: Din = 512 Dout = 32 @@ -251,8 +251,8 @@ def get_perf(first, first_descriptor, second, second_descriptor): hess_revrev = jacrev(jacrev(predict, argnums=2), argnums=2)(weight, bias, x) ####################################################################### -# Let's verify we have the same result regardless of using hessian api or -# using jacfwd(jacfwd()) +# Let's verify we have the same result regardless of using hessian API or +# using ``jacfwd(jacfwd())``. torch.allclose(hess_api, hess_fwdfwd) @@ -265,7 +265,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): # shape ``(B, N)`` and a function that goes from :math:`R^N \to R^M`, we would like # a Jacobian of shape ``(B, M, N)``. # -# The easiest way to do this is to use vmap: +# The easiest way to do this is to use ``vmap``: batch_size = 64 Din = 31 @@ -284,7 +284,7 @@ def get_perf(first, first_descriptor, second, second_descriptor): ####################################################################### # If you have a function that goes from (B, N) -> (B, M) instead and are # certain that each input produces an independent output, then it's also -# sometimes possible to do this without using vmap by summing the outputs +# sometimes possible to do this without using ``vmap`` by summing the outputs # and then computing the Jacobian of that function: def predict_with_output_summed(weight, bias, x): @@ -295,10 +295,10 @@ def predict_with_output_summed(weight, bias, x): ####################################################################### # If you instead have a function that goes from :math:`R^N \to R^M` but inputs that -# are batched, you compose vmap with jacrev to compute batched jacobians: +# are batched, you compose ``vmap`` with ``jacrev`` to compute batched jacobians: # # Finally, batch hessians can be computed similarly. It's easiest to think -# about them by using vmap to batch over hessian computation, but in some +# about them by using ``vmap`` to batch over hessian computation, but in some # cases the sum trick also works. 
compute_batch_hessian = vmap(hessian(predict, argnums=2), in_dims=(None, None, 0)) diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py index 4445704cd1b..21a5635c21b 100755 --- a/intermediate_source/mario_rl_tutorial.py +++ b/intermediate_source/mario_rl_tutorial.py @@ -3,10 +3,7 @@ Train a Mario-playing RL Agent ================ -Authors: `Yuansong Feng `__, `Suraj -Subramanian `__, `Howard -Wang `__, `Steven -Guo `__. +**Authors:** `Yuansong Feng `__, `Suraj Subramanian `__, `Howard Wang `__, `Steven Guo `__. This tutorial walks you through the fundamentals of Deep Reinforcement @@ -308,9 +305,9 @@ def act(self, state): Given a state, choose an epsilon-greedy action and update value of step. Inputs: - state(LazyFrame): A single observation of the current state, dimension is (state_dim) + state(``LazyFrame``): A single observation of the current state, dimension is (state_dim) Outputs: - action_idx (int): An integer representing which action Mario will perform + ``action_idx`` (``int``): An integer representing which action Mario will perform """ # EXPLORE if np.random.rand() < self.exploration_rate: @@ -359,11 +356,11 @@ def cache(self, state, next_state, action, reward, done): Store the experience to self.memory (replay buffer) Inputs: - state (LazyFrame), - next_state (LazyFrame), - action (int), - reward (float), - done(bool)) + state (``LazyFrame``), + next_state (``LazyFrame``), + action (``int``), + reward (``float``), + done(``bool``)) """ def first_if_tuple(x): return x[0] if isinstance(x, tuple) else x @@ -408,7 +405,7 @@ def recall(self): class MarioNet(nn.Module): - """mini cnn structure + """mini CNN structure input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output """ diff --git a/intermediate_source/memory_format_tutorial.py b/intermediate_source/memory_format_tutorial.py index af2842bf333..f08980265de 100644 --- a/intermediate_source/memory_format_tutorial.py +++ b/intermediate_source/memory_format_tutorial.py @@ -40,7 +40,7 @@ N, C, H, W = 10, 3, 32, 32 x = torch.empty(N, C, H, W) -print(x.stride()) # Ouputs: (3072, 1024, 32, 1) +print(x.stride()) # Outputs: (3072, 1024, 32, 1) ###################################################################### # Conversion operator @@ -56,11 +56,11 @@ ###################################################################### # Alternative option x = x.contiguous(memory_format=torch.channels_last) -print(x.stride()) # Ouputs: (3072, 1, 96, 3) +print(x.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # Format checks -print(x.is_contiguous(memory_format=torch.channels_last)) # Ouputs: True +print(x.is_contiguous(memory_format=torch.channels_last)) # Outputs: True ###################################################################### # There are minor difference between the two APIs ``to`` and @@ -82,8 +82,8 @@ # sizes are 1 in order to properly represent the intended memory # format special_x = torch.empty(4, 1, 4, 4) -print(special_x.is_contiguous(memory_format=torch.channels_last)) # Ouputs: True -print(special_x.is_contiguous(memory_format=torch.contiguous_format)) # Ouputs: True +print(special_x.is_contiguous(memory_format=torch.channels_last)) # Outputs: True +print(special_x.is_contiguous(memory_format=torch.contiguous_format)) # Outputs: True ###################################################################### # Same thing applies to explicit permutation API ``permute``. 
In @@ -100,36 +100,36 @@ ###################################################################### # Create as channels last x = torch.empty(N, C, H, W, memory_format=torch.channels_last) -print(x.stride()) # Ouputs: (3072, 1, 96, 3) +print(x.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # ``clone`` preserves memory format y = x.clone() -print(y.stride()) # Ouputs: (3072, 1, 96, 3) +print(y.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # ``to``, ``cuda``, ``float`` ... preserves memory format if torch.cuda.is_available(): y = x.cuda() - print(y.stride()) # Ouputs: (3072, 1, 96, 3) + print(y.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # ``empty_like``, ``*_like`` operators preserves memory format y = torch.empty_like(x) -print(y.stride()) # Ouputs: (3072, 1, 96, 3) +print(y.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### # Pointwise operators preserves memory format z = x + y -print(z.stride()) # Ouputs: (3072, 1, 96, 3) +print(z.stride()) # Outputs: (3072, 1, 96, 3) ###################################################################### -# Conv, Batchnorm modules using cudnn backends support channels last -# (only works for CudNN >= 7.6). Convolution modules, unlike binary +# ``Conv``, ``Batchnorm`` modules using ``cudnn`` backends support channels last +# (only works for cuDNN >= 7.6). Convolution modules, unlike binary # p-wise operator, have channels last as the dominating memory format. -# IFF all inputs are in contiguous memory format, the operator -# produces output in contiguous memory format. Otherwise, output wil -# be in channels last memroy format. +# If all inputs are in contiguous memory format, the operator +# produces output in contiguous memory format. Otherwise, output will +# be in channels last memory format. if torch.backends.cudnn.version() >= 7603: model = torch.nn.Conv2d(8, 4, 3).cuda().half() @@ -139,7 +139,7 @@ input = input.to(device="cuda", memory_format=torch.channels_last, dtype=torch.float16) out = model(input) - print(out.is_contiguous(memory_format=torch.channels_last)) # Ouputs: True + print(out.is_contiguous(memory_format=torch.channels_last)) # Outputs: True ###################################################################### # When input tensor reaches a operator without channels last support, @@ -152,13 +152,13 @@ # Performance Gains # -------------------------------------------------------------------- # Channels last memory format optimizations are available on both GPU and CPU. -# On GPU, the most significant performance gains are observed on NVidia's +# On GPU, the most significant performance gains are observed on NVIDIA's # hardware with Tensor Cores support running on reduced precision # (``torch.float16``). -# We were able to archive over 22% perf gains with channels last +# We were able to archive over 22% performance gains with channels last # comparing to contiguous format, both while utilizing # 'AMP (Automated Mixed Precision)' training scripts. -# Our scripts uses AMP supplied by NVidia +# Our scripts uses AMP supplied by NVIDIA # https://github.com/NVIDIA/apex. 
# # ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 ./data`` @@ -196,7 +196,7 @@ # Epoch: [0][80/125] Time 0.260 (0.335) Speed 770.324 (597.659) Loss 2.2505953312 (1.0879) Prec@1 50.500 (52.938) Prec@5 100.000 (100.000) ###################################################################### -# Passing ``--channels-last true`` allows running a model in Channels last format with observed 22% perf gain. +# Passing ``--channels-last true`` allows running a model in Channels last format with observed 22% performance gain. # # ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 --channels-last true ./data`` @@ -237,12 +237,12 @@ # Epoch: [0][80/125] Time 0.198 (0.269) Speed 1011.827 (743.883) Loss 2.8196096420 (2.4011) Prec@1 47.500 (50.938) Prec@5 100.000 (100.000) ###################################################################### -# The following list of models has the full support of Channels last and showing 8%-35% perf gains on Volta devices: +# The following list of models has the full support of Channels last and showing 8%-35% performance gains on Volta devices: # ``alexnet``, ``mnasnet0_5``, ``mnasnet0_75``, ``mnasnet1_0``, ``mnasnet1_3``, ``mobilenet_v2``, ``resnet101``, ``resnet152``, ``resnet18``, ``resnet34``, ``resnet50``, ``resnext50_32x4d``, ``shufflenet_v2_x0_5``, ``shufflenet_v2_x1_0``, ``shufflenet_v2_x1_5``, ``shufflenet_v2_x2_0``, ``squeezenet1_0``, ``squeezenet1_1``, ``vgg11``, ``vgg11_bn``, ``vgg13``, ``vgg13_bn``, ``vgg16``, ``vgg16_bn``, ``vgg19``, ``vgg19_bn``, ``wide_resnet101_2``, ``wide_resnet50_2`` # ###################################################################### -# The following list of models has the full support of Channels last and showing 26%-76% perf gains on Intel(R) Xeon(R) Ice Lake (or newer) CPUs: +# The following list of models has the full support of Channels last and showing 26%-76% performance gains on Intel(R) Xeon(R) Ice Lake (or newer) CPUs: # ``alexnet``, ``densenet121``, ``densenet161``, ``densenet169``, ``googlenet``, ``inception_v3``, ``mnasnet0_5``, ``mnasnet1_0``, ``resnet101``, ``resnet152``, ``resnet18``, ``resnet34``, ``resnet50``, ``resnext101_32x8d``, ``resnext50_32x4d``, ``shufflenet_v2_x0_5``, ``shufflenet_v2_x1_0``, ``squeezenet1_0``, ``squeezenet1_1``, ``vgg11``, ``vgg11_bn``, ``vgg13``, ``vgg13_bn``, ``vgg16``, ``vgg16_bn``, ``vgg19``, ``vgg19_bn``, ``wide_resnet101_2``, ``wide_resnet50_2`` # @@ -381,7 +381,7 @@ def attribute(m): # ---------- # There are still many things to do, such as: # -# - Resolving ambiguity of N1HW and NC11 Tensors; +# - Resolving ambiguity of ``N1HW`` and ``NC11`` Tensors; # - Testing of Distributed Training support; # - Improving operators coverage. # diff --git a/intermediate_source/mnist_train_nas.py b/intermediate_source/mnist_train_nas.py index e3141e3d958..4ae6d894fce 100644 --- a/intermediate_source/mnist_train_nas.py +++ b/intermediate_source/mnist_train_nas.py @@ -1,5 +1,5 @@ """ -Example training code for ax_multiobjective_nas_tutorial.py +Example training code for ``ax_multiobjective_nas_tutorial.py`` """ import argparse diff --git a/intermediate_source/model_parallel_tutorial.py b/intermediate_source/model_parallel_tutorial.py index 7e5cb84c499..d7a4da73371 100644 --- a/intermediate_source/model_parallel_tutorial.py +++ b/intermediate_source/model_parallel_tutorial.py @@ -259,11 +259,11 @@ def forward(self, x): ret = [] for s_next in splits: - # A. s_prev runs on cuda:1 + # A. 
``s_prev`` runs on ``cuda:1`` s_prev = self.seq2(s_prev) ret.append(self.fc(s_prev.view(s_prev.size(0), -1))) - # B. s_next runs on cuda:0, which can run concurrently with A + # B. ``s_next`` runs on ``cuda:0``, which can run concurrently with A s_prev = self.seq1(s_next).to('cuda:1') s_prev = self.seq2(s_prev) @@ -339,7 +339,7 @@ def forward(self, x): # still opportunities to further accelerate the training process. For example, # all operations on ``cuda:0`` is placed on its default stream. It means that # computations on the next split cannot overlap with the copy operation of the -# prev split. However, as prev and next splits are different tensors, there is +# ``prev`` split. However, as ``prev`` and next splits are different tensors, there is # no problem to overlap one's computation with the other one's copy. The # implementation need to use multiple streams on both GPUs, and different # sub-network structures require different stream management strategies. As no From 6cae5aeec15bae65d6c05dbd32a3061f38153a85 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 18 Apr 2023 15:58:10 -0700 Subject: [PATCH 2/3] Update mario_rl_tutorial.py --- intermediate_source/mario_rl_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py index 21a5635c21b..ff653d54c11 100755 --- a/intermediate_source/mario_rl_tutorial.py +++ b/intermediate_source/mario_rl_tutorial.py @@ -396,7 +396,7 @@ def recall(self): # In our implementation, we share feature generator ``features`` across # :math:`Q_{online}` and :math:`Q_{target}`, but maintain separate FC # classifiers for each. :math:`\theta_{target}` (the parameters of -# :math:`Q_{target}`) is frozen to prevent updation by backprop. Instead, +# :math:`Q_{target}`) is frozen to prevent updating by backprop. Instead, # it is periodically synced with :math:`\theta_{online}` (more on this # later). # From 7f7e62e8a34581115c7bbc8f5dca55fb3dea57ff Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 18 Apr 2023 15:58:31 -0700 Subject: [PATCH 3/3] Update en-wordlist.txt --- en-wordlist.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/en-wordlist.txt b/en-wordlist.txt index 7fd34f1ee56..9a4a99d1df2 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -278,7 +278,6 @@ unfused unimodal unnormalized unpickling -updation utils vectorization vectorize