diff --git a/.circleci/config.yml b/.circleci/config.yml index 5697f123d49..73d6353f673 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -562,10 +562,11 @@ workflows: branches: only: - master - - pytorch_windows_build_worker: - name: win_test_worker - filters: - branches: - only: - - master +# - pytorch_windows_build_worker: +# name: win_test_worker +# type: approval +# filters: +# branches: +# only: +# - master diff --git a/.jenkins/remove_runnable_code.py b/.jenkins/remove_runnable_code.py index 6a61cb656bc..bd62f0c5156 100644 --- a/.jenkins/remove_runnable_code.py +++ b/.jenkins/remove_runnable_code.py @@ -16,9 +16,17 @@ if line.startswith('#'): ret_lines.append(line) state = STATE_NORMAL + elif ((line.startswith('"""') or line.startswith('r"""')) and + line.endswith('"""')): + ret_lines.append(line) + state = STATE_NORMAL elif line.startswith('"""') or line.startswith('r"""'): ret_lines.append(line) state = STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE + elif ((line.startswith("'''") or line.startswith("r'''")) and + line.endswith("'''")): + ret_lines.append(line) + state = STATE_NORMAL elif line.startswith("'''") or line.startswith("r'''"): ret_lines.append(line) state = STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE diff --git a/_static/img/ray-tune.png b/_static/img/ray-tune.png new file mode 100644 index 00000000000..febd6de282e Binary files /dev/null and b/_static/img/ray-tune.png differ diff --git a/_static/img/thumbnails/cropped/amp.png b/_static/img/thumbnails/cropped/amp.png new file mode 100644 index 00000000000..a6916ce5605 Binary files /dev/null and b/_static/img/thumbnails/cropped/amp.png differ diff --git a/_static/img/thumbnails/cropped/profile.png b/_static/img/thumbnails/cropped/profile.png new file mode 100644 index 00000000000..372db8bbe87 Binary files /dev/null and b/_static/img/thumbnails/cropped/profile.png differ diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 7a7d806c328..4f3b52fea32 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -105,6 +105,8 @@ speaking, the structure of your registrations will look like this: that provides implementations for all basic operators on the XLA dispatch key. +.. _autograd-support: + Adding autograd support ----------------------- @@ -229,38 +231,97 @@ Autocast ^^^^^^^^ The Autocast dispatch key implements support for -`automatic mixed precision `_ -(AMP). An autocast kernel typically modifies the operation of an operator by casting the -input arguments to some precision before carrying out the operation. For some -operations, it is numerically safe to cast to lower precision, which is how AMP -can achieve speed ups and reduced memory usage without sacrificing much -accuracy. A nontrivial autocast kernel looks something like this: +`automatic mixed precision (AMP) `_. +An autocast wrapper kernel typically casts incoming ``float16`` or ``float32`` CUDA tensors +to some preferred precision before running the op. +For example, matmuls and convolutions on floating-point CUDA tensors usually run faster +and use less memory in ``float16`` without impairing convergence. +Autocast wrappers only have an effect in +`autocast-enabled contexts `_. + +Here's an autocast wrapper for a hypothetical custom matmul, along with its registration: .. 
code-block:: cpp + // Autocast-specific helper functions + #include + Tensor mymatmul_autocast(const Tensor& self, const Tensor& other) { c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); - return mymatmul(autocast::_cast(at::kHalf, self), autocast::_cast(at::kHalf, other)); + return mymatmul(at::autocast::cached_cast(at::kHalf, self), + at::autocast::cached_cast(at::kHalf, other)); } + TORCH_LIBRARY_IMPL(myops, Autocast, m) { + m.impl("mymatmul", mymatmul_autocast); + } + +``cached_cast(kHalf, tensor)`` casts ``tensor`` to ``float16`` if ``tensor`` is CUDA and ``float32``, +otherwise, it leaves ``tensor`` unchanged (c.f. the +`eligibility policy `_ for natively autocasted ops). +This ensures if the network calls ``mymatmul`` on any mixture of ``float16`` and ``float32`` CUDA tensors, +``mymatmul`` runs in ``float16``. Meanwhile, calls to ``mymatmul`` with non-CUDA, integer-type, or ``float64`` +inputs are unaffected. Using ``cached_cast`` to follow the native eligibility policy in your own autocast wrapper +is recommended, but not required. For example, if you wanted to force ``float16`` execution for all input types, +you could ``return mymatmul(self.half(), other.half());`` instead of using ``cached_cast``. + Notice that, like our autograd kernels, we exclude the ``Autocast`` key from -dispatch before redispatching. By default, if no autocast kernel is provided, -we simply fallthrough directly to the regular operator implementation (no -autocasting occurs.) (We didn't use ``myadd`` for this example, since pointwise -addition doesn't do autocasting and should just fall through). - -When should an autocast kernel be registered? Unfortunately, there aren't -cut-and-dry rules for when you should cast to a lower precision. You can -get a sense for what operators have autocasting behavior by looking at -the `AMP documentation -`_. Some other -general rules: - -* Operations that do reductions should be carried out in float32, -* Any operation with multiple float tensor inputs has to standardize them - to a common precision, and -* Any operation that does a convolution or gemm under the hood should - probably be float16 +dispatch before redispatching. + +By default, if no autocast wrapper is provided, +we fallthrough directly to the regular operator implementation (no +autocasting occurs). (We didn't use ``myadd`` for this example, since pointwise +addition doesn't need autocasting and should just fall through.) + +When should an autocast wrapper be registered? Unfortunately, there aren't +cut-and-dried rules for an op's preferred precision. You can +get a sense for some native ops' preferred precisions by looking at the +`cast lists `_. +General guidance: + +* Ops that do reductions should probably execute in ``float32``, +* Any op that does a convolution or gemm under the hood should + probably execute in ``float16``, and +* Other ops with multiple floating-point tensor inputs should standardize + them to a common precision (unless the implementation supports inputs with different precisions). + +If your custom op falls into the third category, the ``promote_type`` template +helps figure out the widest floating-point type present among input tensors, which is +the safest choice for the execution type: + +.. code-block:: cpp + + #include + + Tensor my_multiple_input_op_autocast(const Tensor& t0, const Tensor& t1) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + // The required at::kHalf argument is an optimistic initial guess. 
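+     // (As the surrounding text notes, promote_type compares at::kHalf with the
+     // inputs' floating-point dtypes and returns the widest one, so any float32
+     // input widens the execution type to float32.)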
+ auto exec_type = at::autocast::promote_type(at::kHalf, t0, t1); + return my_multiple_input_op(at::autocast::cached_cast(exec_type, t0), + at::autocast::cached_cast(exec_type, t1)); + } + +If your custom op is :ref:`autograd-enabled`, you only need to write and register +an autocast wrapper for the same name onto which the autograd wrapper is registered. +For example, if you wanted an autocast wrapper for the ``myadd`` function shown +in the autograd section, all you'd need is + +.. code-block:: cpp + + Tensor myadd_autocast(const Tensor& self, const Tensor& other) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + return myadd(at::autocast::cached_cast(, self), + at::autocast::cached_cast(, other)); + } + + TORCH_LIBRARY_IMPL(myops, Autocast, m) { + m.impl("myadd", myadd_autocast); + } + +There are no separate gymnastics to make the backward method autocast compatible. +However, the backward method defined in your custom autograd function will run in the same +dtype as autocast sets for the forward method, so you should choose a ```` +suitable for both your forward and backward methods. Batched ^^^^^^^ diff --git a/beginner_source/data_loading_tutorial.py b/beginner_source/data_loading_tutorial.py index f0cc99ce081..bf812b8eb52 100644 --- a/beginner_source/data_loading_tutorial.py +++ b/beginner_source/data_loading_tutorial.py @@ -374,7 +374,7 @@ def __call__(self, sample): # dataloader = DataLoader(transformed_dataset, batch_size=4, - shuffle=True, num_workers=4) + shuffle=True, num_workers=0) # Helper function to show a batch diff --git a/beginner_source/dcgan_faces_tutorial.py b/beginner_source/dcgan_faces_tutorial.py index 1da4614ebb7..509bf545234 100644 --- a/beginner_source/dcgan_faces_tutorial.py +++ b/beginner_source/dcgan_faces_tutorial.py @@ -591,7 +591,7 @@ def forward(self, input): # Format batch real_cpu = data[0].to(device) b_size = real_cpu.size(0) - label = torch.full((b_size,), real_label, device=device) + label = torch.full((b_size,), real_label, dtype=torch.float, device=device) # Forward pass real batch through D output = netD(real_cpu).view(-1) # Calculate loss on all-real batch diff --git a/beginner_source/dist_overview.rst b/beginner_source/dist_overview.rst index bc9f7fe6bf0..f2878fb3ba9 100644 --- a/beginner_source/dist_overview.rst +++ b/beginner_source/dist_overview.rst @@ -195,3 +195,13 @@ RPC Tutorials are listed below: `@rpc.functions.async_execution `__ decorator, which can help speed up inference and training. It uses similar RL and PS examples employed in the above tutorials 1 and 2. +5. The `Combining Distributed DataParallel with Distributed RPC Framework <../advanced/rpc_ddp_tutorial.html>`__ + tutorial demonstrates how to combine DDP with RPC to train a model using + distributed data parallelism combined with distributed model parallelism. + + +PyTorch Distributed Developers +------------------------------ + +If you'd like to contribute to PyTorch Distributed, please refer to our +`Developer Guide `_. diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py new file mode 100644 index 00000000000..11524618cba --- /dev/null +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -0,0 +1,462 @@ +# -*- coding: utf-8 -*- +""" +Hyperparameter tuning with Ray Tune +=================================== + +Hyperparameter tuning can make the difference between an average model and a highly +accurate one. 
Often simple things like choosing a different learning rate or changing +a network layer size can have a dramatic impact on your model performance. + +Fortunately, there are tools that help with finding the best combination of parameters. +`Ray Tune `_ is an industry standard tool for +distributed hyperparameter tuning. Ray Tune includes the latest hyperparameter search +algorithms, integrates with TensorBoard and other analysis libraries, and natively +supports distributed training through `Ray's distributed machine learning engine +`_. + +In this tutorial, we will show you how to integrate Ray Tune into your PyTorch +training workflow. We will extend `this tutorial from the PyTorch documentation +`_ for training +a CIFAR10 image classifier. + +As you will see, we only need to add some slight modifications. In particular, we +need to + +1. wrap data loading and training in functions, +2. make some network parameters configurable, +3. add checkpointing (optional), +4. and define the search space for the model tuning + +| + +To run this tutorial, please make sure the following packages are +installed: + +- ``ray[tune]``: Distributed hyperparameter tuning library +- ``torchvision``: For the data transformers + +Setup / Imports +--------------- +Let's start with the imports: +""" +from functools import partial +import numpy as np +import os +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import random_split +import torchvision +import torchvision.transforms as transforms +from ray import tune +from ray.tune import CLIReporter +from ray.tune.schedulers import ASHAScheduler + +###################################################################### +# Most of the imports are needed for building the PyTorch model. Only the last three +# imports are for Ray Tune. +# +# Data loaders +# ------------ +# We wrap the data loaders in their own function and pass a global data directory. +# This way we can share a data directory between different trials. + + +def load_data(data_dir="./data"): + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + ]) + + trainset = torchvision.datasets.CIFAR10( + root=data_dir, train=True, download=True, transform=transform) + + testset = torchvision.datasets.CIFAR10( + root=data_dir, train=False, download=True, transform=transform) + + return trainset, testset + +###################################################################### +# Configurable neural network +# --------------------------- +# We can only tune those parameters that are configurable. In this example, we can specify +# the layer sizes of the fully connected layers: + + +class Net(nn.Module): + def __init__(self, l1=120, l2=84): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, l1) + self.fc2 = nn.Linear(l1, l2) + self.fc3 = nn.Linear(l2, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +###################################################################### +# The train function +# ------------------ +# Now it gets interesting, because we introduce some changes to the example `from the PyTorch +# documentation `_. 
+# +# We wrap the training script in a function ``train_cifar(config, checkpoint_dir=None, data_dir=None)``. +# As you can guess, the ``config`` parameter will receive the hyperparameters we would like to +# train with. The ``checkpoint_dir`` parameter is used to restore checkpoints. The ``data_dir`` specifies +# the directory where we load and store the data, so multiple runs can share the same data source. +# +# .. code-block:: python +# +# net = Net(config["l1"], config["l2"]) +# +# if checkpoint_dir: +# model_state, optimizer_state = torch.load( +# os.path.join(checkpoint_dir, "checkpoint")) +# net.load_state_dict(model_state) +# optimizer.load_state_dict(optimizer_state) +# +# The learning rate of the optimizer is made configurable, too: +# +# .. code-block:: python +# +# optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) +# +# We also split the training data into a training and validation subset. We thus train on +# 80% of the data and calculate the validation loss on the remaining 20%. The batch sizes +# with which we iterate through the training and test sets are configurable as well. +# +# Adding (multi) GPU support with DataParallel +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Image classification benefits largely from GPUs. Luckily, we can continue to use +# PyTorch's abstractions in Ray Tune. Thus, we can wrap our model in ``nn.DataParallel`` +# to support data parallel training on multiple GPUs: +# +# .. code-block:: python +# +# device = "cpu" +# if torch.cuda.is_available(): +# device = "cuda:0" +# if torch.cuda.device_count() > 1: +# net = nn.DataParallel(net) +# net.to(device) +# +# By using a ``device`` variable we make sure that training also works when we have +# no GPUs available. PyTorch requires us to send our data to the GPU memory explicitly, +# like this: +# +# .. code-block:: python +# +# for i, data in enumerate(trainloader, 0): +# inputs, labels = data +# inputs, labels = inputs.to(device), labels.to(device) +# +# The code now supports training on CPUs, on a single GPU, and on multiple GPUs. Notably, Ray +# also supports `fractional GPUs `_ +# so we can share GPUs among trials, as long as the model still fits on the GPU memory. We'll come back +# to that later. +# +# Communicating with Ray Tune +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The most interesting part is the communication with Ray Tune: +# +# .. code-block:: python +# +# with tune.checkpoint_dir(epoch) as checkpoint_dir: +# path = os.path.join(checkpoint_dir, "checkpoint") +# torch.save((net.state_dict(), optimizer.state_dict()), path) +# +# tune.report(loss=(val_loss / val_steps), accuracy=correct / total) +# +# Here we first save a checkpoint and then report some metrics back to Ray Tune. Specifically, +# we send the validation loss and accuracy back to Ray Tune. Ray Tune can then use these metrics +# to decide which hyperparameter configuration lead to the best results. These metrics +# can also be used to stop bad performing trials early in order to avoid wasting +# resources on those trials. +# +# The checkpoint saving is optional, however, it is necessary if we wanted to use advanced +# schedulers like +# `Population Based Training `_. +# Also, by saving the checkpoint we can later load the trained models and validate them +# on a test set. 
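+#
+# As a rough sketch (``my_checkpoint_dir`` stands in for a real checkpoint
+# directory; the full main function below shows how the best trial's checkpoint
+# is actually located), restoring such a checkpoint later could look like this:
+#
+# .. code-block:: python
+#
+#     net = Net(config["l1"], config["l2"])
+#     model_state, optimizer_state = torch.load(
+#         os.path.join(my_checkpoint_dir, "checkpoint"))
+#     net.load_state_dict(model_state)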
+# +# Full training function +# ~~~~~~~~~~~~~~~~~~~~~~ +# +# The full code example looks like this: + + +def train_cifar(config, checkpoint_dir=None, data_dir=None): + net = Net(config["l1"], config["l2"]) + + device = "cpu" + if torch.cuda.is_available(): + device = "cuda:0" + if torch.cuda.device_count() > 1: + net = nn.DataParallel(net) + net.to(device) + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) + + if checkpoint_dir: + model_state, optimizer_state = torch.load( + os.path.join(checkpoint_dir, "checkpoint")) + net.load_state_dict(model_state) + optimizer.load_state_dict(optimizer_state) + + trainset, testset = load_data(data_dir) + + test_abs = int(len(trainset) * 0.8) + train_subset, val_subset = random_split( + trainset, [test_abs, len(trainset) - test_abs]) + + trainloader = torch.utils.data.DataLoader( + train_subset, + batch_size=int(config["batch_size"]), + shuffle=True, + num_workers=8) + valloader = torch.utils.data.DataLoader( + val_subset, + batch_size=int(config["batch_size"]), + shuffle=True, + num_workers=8) + + for epoch in range(10): # loop over the dataset multiple times + running_loss = 0.0 + epoch_steps = 0 + for i, data in enumerate(trainloader, 0): + # get the inputs; data is a list of [inputs, labels] + inputs, labels = data + inputs, labels = inputs.to(device), labels.to(device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + epoch_steps += 1 + if i % 2000 == 1999: # print every 2000 mini-batches + print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, + running_loss / epoch_steps)) + running_loss = 0.0 + + # Validation loss + val_loss = 0.0 + val_steps = 0 + total = 0 + correct = 0 + for i, data in enumerate(valloader, 0): + with torch.no_grad(): + inputs, labels = data + inputs, labels = inputs.to(device), labels.to(device) + + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + loss = criterion(outputs, labels) + val_loss += loss.cpu().numpy() + val_steps += 1 + + with tune.checkpoint_dir(epoch) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + torch.save((net.state_dict(), optimizer.state_dict()), path) + + tune.report(loss=(val_loss / val_steps), accuracy=correct / total) + print("Finished Training") + +###################################################################### +# As you can see, most of the code is adapted directly from the original example. +# +# Test set accuracy +# ----------------- +# Commonly the performance of a machine learning model is tested on a hold-out test +# set with data that has not been used for training the model. 
We also wrap this in a +# function: + + +def test_accuracy(net, device="cpu"): + trainset, testset = load_data() + + testloader = torch.utils.data.DataLoader( + testset, batch_size=4, shuffle=False, num_workers=2) + + correct = 0 + total = 0 + with torch.no_grad(): + for data in testloader: + images, labels = data + images, labels = images.to(device), labels.to(device) + outputs = net(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + return correct / total + +###################################################################### +# The function also expects a ``device`` parameter, so we can do the +# test set validation on a GPU. +# +# Configuring the search space +# ---------------------------- +# Lastly, we need to define Ray Tune's search space. Here is an example: +# +# .. code-block:: python +# +# config = { +# "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)), +# "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)), +# "lr": tune.loguniform(1e-4, 1e-1), +# "batch_size": tune.choice([2, 4, 8, 16]) +# } +# +# The ``tune.sample_from()`` function makes it possible to define your own sample +# methods to obtain hyperparameters. In this example, the ``l1`` and ``l2`` parameters +# should be powers of 2 between 4 and 256, so either 4, 8, 16, 32, 64, 128, or 256. +# The ``lr`` (learning rate) should be uniformly sampled between 0.0001 and 0.1. Lastly, +# the batch size is a choice between 2, 4, 8, and 16. +# +# At each trial, Ray Tune will now randomly sample a combination of parameters from these +# search spaces. It will then train a number of models in parallel and find the best +# performing one among these. We also use the ``ASHAScheduler`` which will terminate bad +# performing trials early. +# +# We wrap the ``train_cifar`` function with ``functools.partial`` to set the constant +# ``data_dir`` parameter. We can also tell Ray Tune what resources should be +# available for each trial: +# +# .. code-block:: python +# +# gpus_per_trial = 2 +# # ... +# result = tune.run( +# partial(train_cifar, data_dir=data_dir), +# resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, +# config=config, +# num_samples=num_samples, +# scheduler=scheduler, +# progress_reporter=reporter, +# checkpoint_at_end=True) +# +# You can specify the number of CPUs, which are then available e.g. +# to increase the ``num_workers`` of the PyTorch ``DataLoader`` instances. The selected +# number of GPUs are made visible to PyTorch in each trial. Trials do not have access to +# GPUs that haven't been requested for them - so you don't have to care about two trials +# using the same set of resources. +# +# Here we can also specify fractional GPUs, so something like ``gpus_per_trial=0.5`` is +# completely valid. The trials will then share GPUs among each other. +# You just have to make sure that the models still fit in the GPU memory. +# +# After training the models, we will find the best performing one and load the trained +# network from the checkpoint file. We then obtain the test set accuracy and report +# everything by printing. 
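+#
+# As an aside, the fractional-GPU setup mentioned above would only change the
+# ``resources_per_trial`` argument. A sketch with illustrative values:
+#
+# .. code-block:: python
+#
+#     result = tune.run(
+#         partial(train_cifar, data_dir=data_dir),
+#         resources_per_trial={"cpu": 2, "gpu": 0.5},  # two trials share each GPU
+#         config=config,
+#         num_samples=num_samples,
+#         scheduler=scheduler,
+#         progress_reporter=reporter)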
+# +# The full main function looks like this: + + +def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): + data_dir = os.path.abspath("./data") + load_data(data_dir) + config = { + "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), + "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([2, 4, 8, 16]) + } + scheduler = ASHAScheduler( + metric="loss", + mode="min", + max_t=max_num_epochs, + grace_period=1, + reduction_factor=2) + reporter = CLIReporter( + # parameter_columns=["l1", "l2", "lr", "batch_size"], + metric_columns=["loss", "accuracy", "training_iteration"]) + result = tune.run( + partial(train_cifar, data_dir=data_dir), + resources_per_trial={"cpu": 2, "gpu": gpus_per_trial}, + config=config, + num_samples=num_samples, + scheduler=scheduler, + progress_reporter=reporter) + + best_trial = result.get_best_trial("loss", "min", "last") + print("Best trial config: {}".format(best_trial.config)) + print("Best trial final validation loss: {}".format( + best_trial.last_result["loss"])) + print("Best trial final validation accuracy: {}".format( + best_trial.last_result["accuracy"])) + + best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"]) + device = "cpu" + if torch.cuda.is_available(): + device = "cuda:0" + if gpus_per_trial > 1: + best_trained_model = nn.DataParallel(best_trained_model) + best_trained_model.to(device) + + best_checkpoint_dir = best_trial.checkpoint.value + model_state, optimizer_state = torch.load(os.path.join( + best_checkpoint_dir, "checkpoint")) + best_trained_model.load_state_dict(model_state) + + test_acc = test_accuracy(best_trained_model, device) + print("Best trial test set accuracy: {}".format(test_acc)) + + +if __name__ == "__main__": + # You can change the number of GPUs per trial here: + main(num_samples=10, max_num_epochs=10, gpus_per_trial=0) + + +###################################################################### +# If you run the code, an example output could look like this: +# +# .. code-block:: +# +# Number of trials: 10 (10 TERMINATED) +# +-----+------+------+-------------+--------------+---------+------------+--------------------+ +# | ... | l1 | l2 | lr | batch_size | loss | accuracy | training_iteration | +# |-----+------+------+-------------+--------------+---------+------------+--------------------| +# | ... | 64 | 4 | 0.00011629 | 2 | 1.87273 | 0.244 | 2 | +# | ... | 32 | 64 | 0.000339763 | 8 | 1.23603 | 0.567 | 8 | +# | ... | 8 | 16 | 0.00276249 | 16 | 1.1815 | 0.5836 | 10 | +# | ... | 4 | 64 | 0.000648721 | 4 | 1.31131 | 0.5224 | 8 | +# | ... | 32 | 16 | 0.000340753 | 8 | 1.26454 | 0.5444 | 8 | +# | ... | 8 | 4 | 0.000699775 | 8 | 1.99594 | 0.1983 | 2 | +# | ... | 256 | 8 | 0.0839654 | 16 | 2.3119 | 0.0993 | 1 | +# | ... | 16 | 128 | 0.0758154 | 16 | 2.33575 | 0.1327 | 1 | +# | ... | 16 | 8 | 0.0763312 | 16 | 2.31129 | 0.1042 | 4 | +# | ... | 128 | 16 | 0.000124903 | 4 | 2.26917 | 0.1945 | 1 | +# +-----+------+------+-------------+--------------+---------+------------+--------------------+ +# +# +# Best trial config: {'l1': 8, 'l2': 16, 'lr': 0.00276249, 'batch_size': 16, 'data_dir': '...'} +# Best trial final validation loss: 1.181501 +# Best trial final validation accuracy: 0.5836 +# Best trial test set accuracy: 0.5806 +# +# Most trials have been stopped early in order to avoid wasting resources. +# The best performing trial achieved a validation accuracy of about 58%, which could +# be confirmed on the test set. 
+# +# So that's it! You can now tune the parameters of your PyTorch models. diff --git a/beginner_source/nlp/pytorch_tutorial.py b/beginner_source/nlp/pytorch_tutorial.py index d61496d382c..2e60c20ab81 100644 --- a/beginner_source/nlp/pytorch_tutorial.py +++ b/beginner_source/nlp/pytorch_tutorial.py @@ -274,7 +274,7 @@ ############################################################### # You can also stop autograd from tracking history on Tensors -# with ``.requires_grad``=True by wrapping the code block in +# with ``.requires_grad=True`` by wrapping the code block in # ``with torch.no_grad():`` print(x.requires_grad) print((x ** 2).requires_grad) diff --git a/beginner_source/nlp/sequence_models_tutorial.py b/beginner_source/nlp/sequence_models_tutorial.py index 4db0361954d..40e4b79afd7 100644 --- a/beginner_source/nlp/sequence_models_tutorial.py +++ b/beginner_source/nlp/sequence_models_tutorial.py @@ -21,7 +21,7 @@ part-of-speech tags, and a myriad of other things. -LSTM's in Pytorch +LSTMs in Pytorch ~~~~~~~~~~~~~~~~~ Before getting to the example, note a few things. Pytorch's LSTM expects diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py index 0ba9711ed67..90c8b902d37 100644 --- a/beginner_source/transformer_tutorial.py +++ b/beginner_source/transformer_tutorial.py @@ -77,9 +77,9 @@ def init_weights(self): self.decoder.weight.data.uniform_(-initrange, initrange) def forward(self, src): - if self.src_mask is None or self.src_mask.size(0) != len(src): + if self.src_mask is None or self.src_mask.size(0) != src.size(0): device = src.device - mask = self._generate_square_subsequent_mask(len(src)).to(device) + mask = self._generate_square_subsequent_mask(src.size(0)).to(device) self.src_mask = mask src = self.encoder(src) * math.sqrt(self.ninp) diff --git a/index.rst b/index.rst index a5ad877b0f4..06a24e8c76d 100644 --- a/index.rst +++ b/index.rst @@ -206,7 +206,7 @@ Welcome to PyTorch Tutorials :header: (prototype) Introduction to Named Tensors in PyTorch :card_description: Learn how to use PyTorch to train a Deep Q Learning (DQN) agent on the CartPole-v0 task from the OpenAI Gym. :image: _static/img/thumbnails/cropped/experimental-Introduction-to-Named-Tensors-in-PyTorch.png - :link: intermediate/memory_format_tutorial.html + :link: intermediate/named_tensor_tutorial.html :tags: Frontend-APIs,Named-Tensor,Best-Practice .. customcarditem:: @@ -260,6 +260,13 @@ Welcome to PyTorch Tutorials .. Model Optimization +.. customcarditem:: + :header: Hyperparameter Tuning Tutorial + :card_description: Learn how to use Ray Tune to find the best performing set of hyperparameters for your model. + :image: _static/img/ray-tune.png + :link: beginner/hyperparameter_tuning_tutorial.html + :tags: Model-Optimization,Best-Practice + .. customcarditem:: :header: Pruning Tutorial :card_description: Learn how to use torch.nn.utils.prune to sparsify your neural networks, and how to extend it to implement your own custom pruning technique. 
@@ -516,6 +523,7 @@ Additional Resources :hidden: :caption: Model Optimization + beginner/hyperparameter_tuning_tutorial intermediate/pruning_tutorial advanced/dynamic_quantization_tutorial intermediate/dynamic_quantization_bert_tutorial diff --git a/intermediate_source/memory_format_tutorial.py b/intermediate_source/memory_format_tutorial.py index 244e23ac204..014f1668504 100644 --- a/intermediate_source/memory_format_tutorial.py +++ b/intermediate_source/memory_format_tutorial.py @@ -261,14 +261,17 @@ def check_cl(*args, **kwargs): return result return check_cl +old_attrs = dict() def attribute(m): + old_attrs[m] = dict() for i in dir(m): e = getattr(m, i) exclude_functions = ['is_cuda', 'has_names', 'numel', 'stride', 'Tensor', 'is_contiguous', '__class__'] if i not in exclude_functions and not i.startswith('_') and '__call__' in dir(e): try: + old_attrs[m][i] = e setattr(m, i, check_wrapper(e)) except Exception as e: print(i) @@ -286,6 +289,13 @@ def attribute(m): # guide https://github.com/pytorch/pytorch/wiki/Writing-memory-format-aware-operators. # +###################################################################### +# Code below is to recover the attributes of torch. + +for (m, attrs) in old_attrs.items(): + for (k,v) in attrs.items(): + setattr(m, k, v) + ###################################################################### # Work to do # ---------- diff --git a/intermediate_source/model_parallel_tutorial.py b/intermediate_source/model_parallel_tutorial.py index 515b689301a..62ca66db2bc 100644 --- a/intermediate_source/model_parallel_tutorial.py +++ b/intermediate_source/model_parallel_tutorial.py @@ -86,7 +86,7 @@ def forward(self, x): # # It is also possible to run an existing single-GPU module on multiple GPUs # with just a few lines of changes. The code below shows how to decompose -# ``torchvision.models.reset50()`` to two GPUs. The idea is to inherit from +# ``torchvision.models.resnet50()`` to two GPUs. The idea is to inherit from # the existing ``ResNet`` module, and split the layers to two GPUs during # construction. Then, override the ``forward`` method to stitch two # sub-networks by moving the intermediate outputs accordingly. @@ -136,7 +136,7 @@ def forward(self, x): # # Let us run an experiment to get a more quantitative view of the execution # time. In this experiment, we train ``ModelParallelResNet50`` and the existing -# ``torchvision.models.reset50()`` by running random inputs and labels through +# ``torchvision.models.resnet50()`` by running random inputs and labels through # them. After the training, the models will not produce any useful predictions, # but we can get a reasonable understanding of the execution times. @@ -245,7 +245,7 @@ def plot(means, stds, labels, fig_name): # ----------------------------- # # In the following experiments, we further divide each 120-image batch into -# 20-image splits. As PyTorch launches CUDA operations asynchronizely, the +# 20-image splits. As PyTorch launches CUDA operations asynchronously, the # implementation does not need to spawn multiple threads to achieve # concurrency. 
diff --git a/recipes_source/distributed_rpc_profiling.rst b/recipes_source/distributed_rpc_profiling.rst new file mode 100644 index 00000000000..da9c003d1c7 --- /dev/null +++ b/recipes_source/distributed_rpc_profiling.rst @@ -0,0 +1,314 @@ +Profiling PyTorch RPC-Based Workloads +====================================== + +In this recipe, you will learn: + +- An overview of the `Distributed RPC Framework`_ +- An overview of the `PyTorch Profiler`_ +- How to use the profiler to profile RPC-based workloads + +Requirements +------------ + +- PyTorch 1.6 + +The instructions for installing PyTorch are +available at `pytorch.org`_. + +What is the Distributed RPC Framework? +--------------------------------------- + +The **Distributed RPC Framework** provides mechanisms for multi-machine model +training through a set of primitives to allow for remote communication, and a +higher-level API to automatically differentiate models split across several machines. +For this recipe, it would be helpful to be familiar with the `Distributed RPC Framework`_ +as well as the `RPC Tutorials`_. + +What is the PyTorch Profiler? +--------------------------------------- +The profiler is a context manager based API that allows for on-demand profiling of +operators in a model's workload. The profiler can be used to analyze various aspects +of a model including execution time, operators invoked, and memory consumption. For a +detailed tutorial on using the profiler to profile a single-node model, please see the +`Profiler Recipe`_. + + + +How to use the Profiler for RPC-based workloads +----------------------------------------------- + +The profiler supports profiling of calls made of RPC and allows the user to have a +detailed view into the operations that take place on different nodes. To demonstrate an +example of this, let's first set up the RPC framework. The below code snippet will initialize +two RPC workers on the same host, named ``worker0`` and ``worker1`` respectively. The workers will +be spawned as subprocesses, and we set some environment variables required for proper +initialization. + +:: + + import torch + import torch.distributed.rpc as rpc + import torch.autograd.profiler as profiler + import torch.multiprocessing as mp + import os + import logging + import sys + + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + logger = logging.getLogger() + + def random_tensor(): + return torch.rand((3, 3), requires_grad=True) + + + def worker(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + worker_name = f"worker{rank}" + + # Initialize RPC framework. + rpc.init_rpc( + name=worker_name, + rank=rank, + world_size=world_size + ) + logger.debug(f"{worker_name} successfully initialized RPC.") + + pass # to be continued below + + logger.debug(f"Rank {rank} waiting for workers and shutting down RPC") + rpc.shutdown() + logger.debug(f"Rank {rank} shutdown RPC") + + + if __name__ == '__main__': + # Run 2 RPC workers. + world_size = 2 + mp.spawn(worker, args=(world_size,), nprocs=world_size) + +Running the above program should present you with the following output: + +:: + + DEBUG:root:worker1 successfully initialized RPC. + DEBUG:root:worker0 successfully initialized RPC. 
+ DEBUG:root:Rank 0 waiting for workers and shutting down RPC + DEBUG:root:Rank 1 waiting for workers and shutting down RPC + DEBUG:root:Rank 1 shutdown RPC + DEBUG:root:Rank 0 shutdown RPC + +Now that we have a skeleton setup of our RPC framework, we can move on to +sending RPCs back and forth and using the profiler to obtain a view of what's +happening under the hood. Let's add to the above ``worker`` function: + +:: + + def worker(rank, world_size): + # Above code omitted... + if rank == 0: + dst_worker_rank = (rank + 1) % world_size + dst_worker_name = f"worker{dst_worker_rank}" + t1, t2 = random_tensor(), random_tensor() + # Send and wait RPC completion under profiling scope. + with profiler.profile() as prof: + fut1 = rpc.rpc_async(dst_worker_name, torch.add, args=(t1, t2)) + fut2 = rpc.rpc_async(dst_worker_name, torch.mul, args=(t1, t2)) + # RPCs must be awaited within profiling scope. + fut1.wait() + fut2.wait() + + print(prof.key_averages().table()) + +The aformentioned code creates 2 RPCs, specifying ``torch.add`` and ``torch.mul``, respectively, +to be run with two random input tensors on worker 1. Since we use the ``rpc_async`` API, +we are returned a ``torch.futures.Future`` object, which must be awaited for the result +of the computation. Note that this wait must take place within the scope created by +the profiling context manager in order for the RPC to be accurately profiled. Running +the code with this new worker function should result in the following output: + +:: + + # Some columns are omitted for brevity, exact output subject to randomness + ---------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID + ---------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + rpc_async#aten::add(worker0 -> worker1) 0.00% 0.000us 0 20.462ms 20.462ms 1 0 + rpc_async#aten::mul(worker0 -> worker1) 0.00% 0.000us 0 5.712ms 5.712ms 1 0 + rpc_async#aten::mul(worker0 -> worker1)#remote_op: mul 1.84% 206.864us 2.69% 302.162us 151.081us 2 1 + rpc_async#aten::add(worker0 -> worker1)#remote_op: add 1.41% 158.501us 1.57% 176.924us 176.924us 1 1 + rpc_async#aten::mul(worker0 -> worker1)#remote_op: output_nr 0.04% 4.980us 0.04% 4.980us 2.490us 2 1 + rpc_async#aten::mul(worker0 -> worker1)#remote_op: is_leaf 0.07% 7.806us 0.07% 7.806us 1.952us 4 1 + rpc_async#aten::add(worker0 -> worker1)#remote_op: empty 0.16% 18.423us 0.16% 18.423us 18.423us 1 1 + rpc_async#aten::mul(worker0 -> worker1)#remote_op: empty 0.14% 15.712us 0.14% 15.712us 15.712us 1 1 + ---------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + Self CPU time total: 11.237ms + +Here we can see that the profiler has profiled our ``rpc_async`` calls made to ``worker1`` +from ``worker0``. In particular, the first 2 entries in the table show details (such as +the operator name, originating worker, and destination worker) about each RPC call made +and the ``CPU total`` column indicates the end-to-end latency of the RPC call. + +We also have visibility into the actual operators invoked remotely on worker 1 due RPC. 
+We can see operations that took place on ``worker1`` by checking the ``Node ID`` column. For +example, we can interpret the row with name ``rpc_async#aten::mul(worker0 -> worker1)#remote_op: mul`` +as a ``mul`` operation taking place on the remote node, as a result of the RPC sent to ``worker1`` +from ``worker0``, specifying ``worker1`` to run the builtin ``mul`` operator on the input tensors. +Note that names of remote operations are prefixed with the name of the RPC event that resulted +in them. For example, remote operations corresponding to the ``rpc.rpc_async(dst_worker_name, torch.add, args=(t1, t2))`` +call are prefixed with ``rpc_async#aten::mul(worker0 -> worker1)``. + +We can also use the profiler to gain insight into user-defined functions that are executed over RPC. +For example, let's add the following to the above ``worker`` function: + +:: + + # Define somewhere outside of worker() func. + def udf_with_ops(): + import time + time.sleep(1) + t1, t2 = random_tensor(), random_tensor() + torch.add(t1, t2) + torch.mul(t1, t2) + + def worker(rank, world_size): + # Above code omitted + with profiler.profile() as p: + fut = rpc.rpc_async(dst_worker_name, udf_with_ops) + fut.wait() + print(p.key_averages().table()) + +The above code creates a user-defined function that sleeps for 1 second, and then executes various +operators. Similar to what we've done above, we send an RPC to the remote worker, specifying it to +run our user-defined function. Running this code should result in the following output: + +:: + + # Exact output subject to randomness + -------------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID + -------------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + rpc_async#udf_with_ops(worker0 -> worker1) 0.00% 0.000us 0 1.008s 1.008s 1 0 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: rand 12.58% 80.037us 47.09% 299.589us 149.795us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: empty 15.40% 98.013us 15.40% 98.013us 24.503us 4 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: uniform_ 22.85% 145.358us 23.87% 151.870us 75.935us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: is_complex 1.02% 6.512us 1.02% 6.512us 3.256us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: add 25.80% 164.179us 28.43% 180.867us 180.867us 1 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: mul 20.48% 130.293us 31.43% 199.949us 99.975us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: output_nr 0.71% 4.506us 0.71% 4.506us 2.253us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: is_leaf 1.16% 7.367us 1.16% 7.367us 1.842us 4 1 + -------------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + +Here we can see that the user-defined function has successfully been profiled with its name +``(rpc_async#udf_with_ops(worker0 -> worker1))``, and has the CPU total time we would roughly expect +(slightly greater than 1s given the ``sleep``). 
Similar to the above profiling output, we can see the +remote operators that have been executed on worker 1 as part of executing this RPC request. + +Lastly, we can visualize remote execution using the tracing functionality provided by the profiler. +Let's add the following code to the above ``worker`` function: + +:: + + def worker(rank, world_size): + # Above code omitted + # Will generate trace for above profiling output + trace_file = "/tmp/trace.json" + prof.export_chrome_trace(trace_file) + logger.debug(f"Wrote trace to {trace_file}") + +Now, we can load the trace file in Chrome (``chrome://tracing``). We should see output similar to +the following: + +.. image:: ../_static/img/rpc_trace_img.png + :scale: 25 % + +As we can see, we have traced our RPC requests and can also visualize traces of the remote operations, +in this case, given in the trace row for ``node_id: 1``. + +Putting it all together, we have the following code for this recipe: + +:: + + import torch + import torch.distributed.rpc as rpc + import torch.autograd.profiler as profiler + import torch.multiprocessing as mp + import os + import logging + import sys + + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + logger = logging.getLogger() + + def random_tensor(): + return torch.rand((3, 3), requires_grad=True) + + def udf_with_ops(): + import time + time.sleep(1) + t1, t2 = random_tensor(), random_tensor() + torch.add(t1, t2) + torch.mul(t1, t2) + + def worker(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + worker_name = f"worker{rank}" + + # Initialize RPC framework. + rpc.init_rpc( + name=worker_name, + rank=rank, + world_size=world_size + ) + logger.debug(f"{worker_name} successfully initialized RPC.") + + if rank == 0: + dst_worker_rank = (rank + 1) % world_size + dst_worker_name = f"worker{dst_worker_rank}" + t1, t2 = random_tensor(), random_tensor() + # Send and wait RPC completion under profiling scope. + with profiler.profile() as prof: + fut1 = rpc.rpc_async(dst_worker_name, torch.add, args=(t1, t2)) + fut2 = rpc.rpc_async(dst_worker_name, torch.mul, args=(t1, t2)) + # RPCs must be awaited within profiling scope. + fut1.wait() + fut2.wait() + print(prof.key_averages().table()) + + with profiler.profile() as p: + fut = rpc.rpc_async(dst_worker_name, udf_with_ops) + fut.wait() + + print(p.key_averages().table()) + + trace_file = "/tmp/trace.json" + prof.export_chrome_trace(trace_file) + logger.debug(f"Wrote trace to {trace_file}") + + + logger.debug(f"Rank {rank} waiting for workers and shutting down RPC") + rpc.shutdown() + logger.debug(f"Rank {rank} shutdown RPC") + + + + if __name__ == '__main__': + # Run 2 RPC workers. + world_size = 2 + mp.spawn(worker, args=(world_size,), nprocs=world_size) + + +Learn More +------------------- + +- `pytorch.org`_ for installation instructions, and more documentation + and tutorials. +- `Distributed RPC Framework`_ for RPC framework and API reference. +- `Full profiler documentation`_ for profiler documentation. + +.. _pytorch.org: https://pytorch.org/ +.. _Full profiler documentation: https://pytorch.org/docs/stable/autograd.html#profiler +.. _Pytorch Profiler: https://pytorch.org/docs/stable/autograd.html#profiler +.. _Distributed RPC Framework: https://pytorch.org/docs/stable/rpc.html +.. _RPC Tutorials: https://pytorch.org/tutorials/intermediate/rpc_tutorial.html +.. 
_Profiler Recipe: https://pytorch.org/tutorials/recipes/recipes/profiler.html diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index f93ee92c2c6..a182b0a11c5 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -56,3 +56,7 @@ PyTorch Recipes 14. mobile_perf.py PyTorch Mobile Performance Recipes https://pytorch.org/tutorials/recipes/mobile_perf.html + +15. amp_recipe.py + Automatic Mixed Precision + https://pytorch.org/tutorials/recipes/amp_recipe.html diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py new file mode 100644 index 00000000000..c1ec52a3883 --- /dev/null +++ b/recipes_source/recipes/amp_recipe.py @@ -0,0 +1,325 @@ +# -*- coding: utf-8 -*- +""" +Automatic Mixed Precision +************************* +**Author**: `Michael Carilli `_ + +`torch.cuda.amp `_ provides convenience methods for mixed precision, +where some operations use the ``torch.float32`` (``float``) datatype and other operations +use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, +are much faster in ``float16``. Other ops, like reductions, often require the dynamic +range of ``float32``. Mixed precision tries to match each op to its appropriate datatype, +which can reduce your network's runtime and memory footprint. + +Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and +`torch.cuda.amp.GradScaler `_ together. + +This recipe measures the performance of a simple network in default precision, +then walks through adding ``autocast`` and ``GradScaler`` to run the same network in +mixed precision with improved performance. + +You may download and run this recipe as a standalone Python script. +The only requirements are Pytorch 1.6+ and a CUDA-capable GPU. + +Mixed precision primarily benefits Tensor Core-enabled architectures (Volta, Turing, Ampere). +This recipe should show significant (2-3X) speedup on those architectures. +On earlier architectures (Kepler, Maxwell, Pascal), you may observe a modest speedup. +Run ``nvidia-smi`` to display your GPU's architecture. +""" + +import torch, time, gc + +# Timing utilities +start_time = None + +def start_timer(): + global start_time + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.synchronize() + start_time = time.time() + +def end_timer_and_print(local_msg): + torch.cuda.synchronize() + end_time = time.time() + print("\n" + local_msg) + print("Total execution time = {:.3f} sec".format(end_time - start_time)) + print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated())) + +########################################################## +# A simple network +# ---------------- +# The following sequence of linear layers and ReLUs should show a speedup with mixed precision. + +def make_model(in_size, out_size, num_layers): + layers = [] + for _ in range(num_layers - 1): + layers.append(torch.nn.Linear(in_size, in_size)) + layers.append(torch.nn.ReLU()) + layers.append(torch.nn.Linear(in_size, out_size)) + return torch.nn.Sequential(*tuple(layers)).cuda() + +########################################################## +# ``batch_size``, ``in_size``, ``out_size``, and ``num_layers`` are chosen to be large enough to saturate the GPU with work. +# Typically, mixed precision provides the greatest speedup when the GPU is saturated. +# Small networks may be CPU bound, in which case mixed precision won't improve performance. 
+# Sizes are also chosen such that linear layers' participating dimensions are multiples of 8, +# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting` below). +# +# Exercise: Vary participating sizes and see how the mixed precision speedup changes. + +batch_size = 512 # Try, for example, 128, 256, 513. +in_size = 4096 +out_size = 4096 +num_layers = 3 +num_batches = 50 +epochs = 3 + +# Creates data in default precision. +# The same data is used for both default and mixed precision trials below. +# You don't need to manually change inputs' dtype when enabling mixed precision. +data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] +targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] + +loss_fn = torch.nn.MSELoss().cuda() + +########################################################## +# Default Precision +# ----------------- +# Without ``torch.cuda.amp``, the following simple network executes all ops in default precision (``torch.float32``): + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + output = net(input) + loss = loss_fn(output, target) + loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance +end_timer_and_print("Default precision:") + +########################################################## +# Adding autocast +# --------------- +# Instances of `torch.cuda.amp.autocast `_ +# serve as context managers that allow regions of your script to run in mixed precision. +# +# In these regions, CUDA ops run in a dtype chosen by autocast +# to improve performance while maintaining accuracy. +# See the `Autocast Op Reference `_ +# for details on what precision autocast chooses for each op, and under what circumstances. + +for epoch in range(0): # 0 epochs, this section is for illustration only + for input, target in zip(data, targets): + # Runs the forward pass under autocast. + with torch.cuda.amp.autocast(): + output = net(input) + # output is float16 because linear layers autocast to float16. + assert output.dtype is torch.float16 + + loss = loss_fn(output, target) + # loss is float32 because mse_loss layers autocast to float32. + assert loss.dtype is torch.float32 + + # Exits autocast before backward(). + # Backward passes under autocast are not recommended. + # Backward ops run in the same dtype autocast chose for corresponding forward ops. + loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance + +########################################################## +# Adding GradScaler +# ----------------- +# `Gradient scaling `_ +# helps prevent gradients with small magnitudes from flushing to zero +# ("underflowing") when training with mixed precision. +# +# `torch.cuda.amp.GradScaler `_ +# performs the steps of gradient scaling conveniently. + +# Constructs scaler once, at the beginning of the convergence run, using default args. +# If your network fails to converge with default GradScaler args, please file an issue. +# The same GradScaler instance should be used for the entire convergence run. +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. 
+scaler = torch.cuda.amp.GradScaler() + +for epoch in range(0): # 0 epochs, this section is for illustration only + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + + # Scales loss. Calls backward() on scaled loss to create scaled gradients. + scaler.scale(loss).backward() + + # scaler.step() first unscales the gradients of the optimizer's assigned params. + # If these gradients do not contain infs or NaNs, optimizer.step() is then called, + # otherwise, optimizer.step() is skipped. + scaler.step(opt) + + # Updates the scale for next iteration. + scaler.update() + + opt.zero_grad() # set_to_none=True here can modestly improve performance + +########################################################## +# All together: "Automatic Mixed Precision" +# ------------------------------------------ +# (The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. +# If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. +# This allows switching between default precision and mixed precision without if/else statements.) + +use_amp = True + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) +scaler = torch.cuda.amp.GradScaler(enabled=use_amp) + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(enabled=use_amp): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + scaler.step(opt) + scaler.update() + opt.zero_grad() # set_to_none=True here can modestly improve performance +end_timer_and_print("Mixed precision:") + +########################################################## +# Inspecting/modifying gradients (e.g., clipping) +# -------------------------------------------------------- +# All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect +# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should +# unscale them first using `scaler.unscale_(optimizer) `_. + +for epoch in range(0): # 0 epochs, this section is for illustration only + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + + # Unscales the gradients of optimizer's assigned params in-place + scaler.unscale_(opt) + + # Since the gradients of optimizer's assigned params are now unscaled, clips as usual. + # You may use the same value for max_norm here as you would without gradient scaling. + torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.1) + + scaler.step(opt) + scaler.update() + opt.zero_grad() # set_to_none=True here can modestly improve performance + +########################################################## +# Saving/Resuming +# ---------------- +# To save/resume Amp-enabled runs with bitwise accuracy, use +# `scaler.state_dict `_ and +# `scaler.load_state_dict `_. +# +# When saving, save the scaler state dict alongside the usual model and optimizer state dicts. +# Do this either at the beginning of an iteration before any forward passes, or at the end of +# an iteration after ``scaler.update()``. 
+ +checkpoint = {"model": net.state_dict(), + "optimizer": opt.state_dict(), + "scaler": scaler.state_dict()} +# Write checkpoint as desired, e.g., +# torch.save(checkpoint, "filename") + +########################################################## +# When resuming, load the scaler state dict alongside the model and optimizer state dicts. + +# Read checkpoint as desired, e.g., +# dev = torch.cuda.current_device() +# checkpoint = torch.load("filename", +# map_location = lambda storage, loc: storage.cuda(dev)) +net.load_state_dict(checkpoint["model"]) +opt.load_state_dict(checkpoint["optimizer"]) +scaler.load_state_dict(checkpoint["scaler"]) + +########################################################## +# If a checkpoint was created from a run *without* Amp, and you want to resume training *with* Amp, +# load model and optimizer states from the checkpoint as usual. The checkpoint won't contain a saved scaler state, so +# use a fresh instance of ``GradScaler``. +# +# If a checkpoint was created from a run *with* Amp and you want to resume training *without* Amp, +# load model and optimizer states from the checkpoint as usual, and ignore the saved scaler state. + +########################################################## +# Inference/Evaluation +# -------------------- +# ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary. + +########################################################## +# .. _advanced-topics: +# +# Advanced topics +# --------------- +# See the `Automatic Mixed Precision Examples `_ for advanced use cases including: +# +# * Gradient accumulation +# * Gradient penalty/double backward +# * Networks with multiple models, optimizers, or losses +# * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) +# * Custom autograd functions (subclasses of ``torch.autograd.Function``) +# +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. +# +# If you're registering a custom C++ op with the dispatcher, see the +# `autocast section `_ +# of the dispatcher tutorial. + +########################################################## +# .. _troubleshooting: +# +# Troubleshooting +# --------------- +# Speedup with Amp is minor +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. Your network may fail to saturate the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance +# won't matter. +# +# * A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) +# as much as you can without running OOM. +# * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors). +# * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can). +# 2. Your network may be GPU compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. +# In this case a reduced speedup is expected. +# 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. +# (For NLP models with encoders/decoders, this can be subtle. Also, convolutions used to have similar size constraints +# for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See +# `here `_ for guidance.) +# +# Loss is inf/NaN +# ~~~~~~~~~~~~~~~ +# First, check if your network fits an :ref:`advanced use case`. 
+# See also `Prefer binary_cross_entropy_with_logits over binary_cross_entropy `_. +# +# If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: +# +# 1. Disable ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist. +# 2. If you suspect part of your network (e.g., a complicated loss function) overflows , run that forward region in ``float32`` +# and see if infs/NaNs persist. +# `The autocast docstring `_'s last code snippet +# shows forcing a subregion to run in ``float32`` (by locally disabling autocast and casting the subregion's inputs). +# +# Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Autocast tries to cover all ops that benefit from or require casting. +# `Ops that receive explicit coverage `_ +# are chosen based on numerical properties, but also on experience. +# If you see a type mismatch error in an autocast-enabled forward region or a backward pass following that region, +# it's possible autocast missed an op. +# +# Please file an issue with the error backtrace. ``export TORCH_SHOW_CPP_STACKTRACES=1`` before running your script to provide +# fine-grained information on which backend op is failing. diff --git a/recipes_source/recipes/defining_a_neural_network.py b/recipes_source/recipes/defining_a_neural_network.py index bdb7ccfb375..f0a4ef69be3 100644 --- a/recipes_source/recipes/defining_a_neural_network.py +++ b/recipes_source/recipes/defining_a_neural_network.py @@ -26,7 +26,7 @@ :: - pip install torchaudio + pip install torch """ diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py new file mode 100644 index 00000000000..9d9726ae7e1 --- /dev/null +++ b/recipes_source/recipes/tuning_guide.py @@ -0,0 +1,370 @@ +""" +Performance Tuning Guide +************************* +**Author**: `Szymon Migacz `_ + +Performance Tuning Guide is a set of optimizations and best practices which can +accelerate training and inference of deep learning models in PyTorch. Presented +techniques often can be implemented by changing only a few lines of code and can +be applied to a wide range of deep learning models across all domains. + +General optimizations +--------------------- +""" + +############################################################################### +# Enable async data loading and augmentation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# `torch.utils.data.DataLoader `_ +# supports asynchronous data loading and data augmentation in separate worker +# subprocesses. The default setting for ``DataLoader`` is ``num_workers=0``, +# which means that the data loading is synchronous and done in the main process. +# As a result the main training process has to wait for the data to be available +# to continue the execution. +# +# Setting ``num_workers > 0`` enables asynchronous data loading and overlap +# between the training and data loading. ``num_workers`` should be tuned +# depending on the workload, CPU, GPU, and location of training data. +# +# ``DataLoader`` accepts ``pin_memory`` argument, which defaults to ``False``. +# When using a GPU it's better to set ``pin_memory=True``, this instructs +# ``DataLoader`` to use pinned memory and enables faster and asynchronous memory +# copy from the host to the GPU. 
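+#
+# The snippet below is a minimal sketch of these two settings; the random
+# tensors stand in for a real dataset, and ``num_workers=4`` is only an example
+# value that should be tuned for your workload:
+
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+
+example_dataset = TensorDataset(torch.randn(512, 3, 32, 32),
+                                torch.randint(0, 10, (512,)))
+
+example_loader = DataLoader(example_dataset,
+                            batch_size=64,
+                            shuffle=True,
+                            num_workers=4,    # > 0 loads data in worker subprocesses
+                            pin_memory=True)  # enables faster host-to-GPU copies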
+
+###############################################################################
+# Disable gradient calculation for validation or inference
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# PyTorch saves intermediate buffers from all operations which involve tensors
+# that require gradients. Typically gradients aren't needed for validation or
+# inference.
+# The `torch.no_grad() `_
+# context manager can be applied to disable gradient calculation within a
+# specified block of code. This accelerates execution and reduces the amount of
+# required memory.
+# `torch.no_grad() `_
+# can also be used as a function decorator.
+
+###############################################################################
+# Disable bias for convolutions directly followed by a batch norm
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# `torch.nn.Conv2d() `_
+# has a ``bias`` parameter, which defaults to ``True`` (the same is true for
+# `Conv1d `_
+# and
+# `Conv3d `_
+# ).
+#
+# If an ``nn.Conv2d`` layer is directly followed by an ``nn.BatchNorm2d`` layer,
+# then the bias in the convolution is not needed; use
+# ``nn.Conv2d(..., bias=False, ...)`` instead. The bias is not needed because in
+# its first step ``BatchNorm`` subtracts the mean, which effectively cancels out
+# the effect of the bias.
+#
+# This is also applicable to 1d and 3d convolutions as long as ``BatchNorm`` (or
+# another normalization layer) normalizes over the same dimension as the
+# convolution's bias.
+#
+# Models available from `torchvision `_
+# already implement this optimization.
+
+###############################################################################
+# Use parameter.grad = None instead of model.zero_grad() or optimizer.zero_grad()
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Instead of calling:
+model.zero_grad()
+# or
+optimizer.zero_grad()
+
+###############################################################################
+# to zero out gradients, use the following method instead:
+
+for param in model.parameters():
+    param.grad = None
+
+###############################################################################
+# The second code snippet does not zero the memory of each individual parameter.
+# In addition, the subsequent backward pass uses assignment instead of addition
+# to store gradients, which reduces the number of memory operations.
+#
+# Setting gradients to ``None`` has slightly different numerical behavior than
+# setting them to zero; for more details refer to the
+# `documentation `_.
+#
+# Alternatively, starting from PyTorch 1.7, call
+# ``model.zero_grad(set_to_none=True)`` or ``optimizer.zero_grad(set_to_none=True)``.
+
+###############################################################################
+# Fuse pointwise operations
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+# Pointwise operations (elementwise addition, multiplication, math functions
+# such as ``sin()``, ``cos()``, ``sigmoid()``, etc.) can be fused into a single
+# kernel to amortize memory access time and kernel launch time.
+#
+# `PyTorch JIT `_ can fuse kernels
+# automatically, although there could be additional fusion opportunities not yet
+# implemented in the compiler, and not all device types are supported equally.
+#
+# Pointwise operations are memory-bound; for each operation PyTorch launches a
+# separate kernel. Each kernel loads data from memory, performs the computation
+# (this step is usually inexpensive) and stores the results back into memory.
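+#
+# As a point of comparison, the eager-mode function below (a hypothetical
+# ``unfused_gelu``, shown only for illustration) performs each pointwise op
+# separately: the division, ``erf``, addition and both multiplications each
+# launch their own kernel and read/write the whole tensor:
+
+import torch
+
+def unfused_gelu(x):
+    # every intermediate result is written to and re-read from memory
+    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
+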
+###############################################################################
+# A fused operator launches only one kernel for multiple fused pointwise ops and
+# loads/stores data to memory only once. This makes JIT very useful for
+# activation functions, optimizers, custom RNN cells, etc.
+#
+# In the simplest case fusion can be enabled by applying the
+# `torch.jit.script `_
+# decorator to the function definition, for example:
+
+@torch.jit.script
+def fused_gelu(x):
+    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
+
+###############################################################################
+# Refer to the
+# `TorchScript documentation `_
+# for more advanced use cases.
+
+###############################################################################
+# Enable channels_last memory format for computer vision models
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# PyTorch 1.5 introduced support for the ``channels_last`` memory format for
+# convolutional networks. This format is meant to be used in conjunction with
+# `AMP `_ to further accelerate
+# convolutional neural networks with
+# `Tensor Cores `_.
+#
+# Support for ``channels_last`` is experimental, but it's expected to work for
+# standard computer vision models (e.g. ResNet-50, SSD). To convert models to
+# the ``channels_last`` format, follow the
+# `Channels Last Memory Format Tutorial `_.
+# The tutorial includes a section on
+# `converting existing models `_.
+
+###############################################################################
+# Checkpoint intermediate buffers
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Buffer checkpointing is a technique to mitigate the memory capacity burden of
+# model training. Instead of storing inputs of all layers to compute upstream
+# gradients in backward propagation, it stores the inputs of only a few layers,
+# and the others are recomputed during the backward pass. The reduced memory
+# requirements enable increasing the batch size, which can improve utilization.
+#
+# Checkpointing targets should be selected carefully: ideally, avoid storing
+# large layer outputs that have a small re-computation cost. Example target
+# layers are activation functions (e.g. ``ReLU``, ``Sigmoid``, ``Tanh``),
+# up/down sampling, and matrix-vector operations with small accumulation depth.
+#
+# PyTorch supports a native
+# `torch.utils.checkpoint `_
+# API to automatically perform checkpointing and recomputation.
+
+###############################################################################
+# Disable debugging APIs
+# ~~~~~~~~~~~~~~~~~~~~~~
+# Many PyTorch APIs are intended for debugging and should be disabled for
+# regular training runs:
+#
+# * anomaly detection:
+#   `torch.autograd.detect_anomaly `_
+#   or
+#   `torch.autograd.set_detect_anomaly(True) `_
+# * profiler related:
+#   `torch.autograd.profiler.emit_nvtx `_,
+#   `torch.autograd.profiler.profile `_
+# * autograd gradcheck:
+#   `torch.autograd.gradcheck `_
+#   or
+#   `torch.autograd.gradgradcheck `_
+#
+
+###############################################################################
+# GPU specific optimizations
+# --------------------------
+
+###############################################################################
+# Enable cuDNN auto-tuner
+# ~~~~~~~~~~~~~~~~~~~~~~~
+# `NVIDIA cuDNN `_ supports many algorithms
+# to compute a convolution. The auto-tuner runs a short benchmark and selects
+# the kernel with the best performance on the given hardware for a given input size.
+# +# For convolutional networks (other types currently not supported), enable cuDNN +# autotuner before launching the training loop by setting: + +torch.backends.cudnn.benchmark = True +############################################################################### +# +# * the auto-tuner decisions may be non-deterministic; different algorithm may +# be selected for different runs. For more details see +# `PyTorch: Reproducibility `_ +# * in some rare cases, such as with highly variable input sizes, it's better +# to run convolutional networks with autotuner disabled to avoid the overhead +# associated with algorithm selection for each input size. +# + +############################################################################### +# Avoid unnecessary CPU-GPU synchronization +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Avoid unnecessary synchronizations, to let the CPU run ahead of the +# accelerator as much as possible to make sure that the accelerator work queue +# contains many operations. +# +# When possible, avoid operations which require synchronizations, for example: +# +# * ``print(cuda_tensor)`` +# * ``cuda_tensor.item()`` +# * memory copies: ``tensor.cuda()``, ``cuda_tensor.cpu()`` and equivalent +# ``tensor.to(device)`` calls +# * ``cuda_tensor.nonzero()`` +# * python control flow which depends on results of operations performed on cuda +# tensors e.g. ``if (cuda_tensor != 0).all()`` +# + +############################################################################### +# Create tensors directly on the target device +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Instead of calling ``torch.rand(size).cuda()`` to generate a random tensor, +# produce the output directly on the target device: +# ``torch.rand(size, device=torch.device('cuda'))``. +# +# This is applicable to all functions which create new tensors and accept +# ``device`` argument: +# `torch.rand() `_, +# `torch.zeros() `_, +# `torch.full() `_ +# and similar. + +############################################################################### +# Use mixed precision and AMP +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Mixed precision leverages +# `Tensor Cores `_ +# and offers up to 3x overall speedup on Volta and newer GPU architectures. To +# use Tensor Cores AMP should be enabled and matrix/tensor dimensions should +# satisfy requirements for calling kernels that use Tensor Cores. +# +# To use Tensor Cores: +# +# * set sizes to multiples of 8 (to map onto dimensions of Tensor Cores) +# +# * see +# `Deep Learning Performance Documentation +# `_ +# for more details and guidelines specific to layer type +# * if layer size is derived from other parameters rather than fixed, it can +# still be explicitly padded e.g. vocabulary size in NLP models +# +# * enable AMP +# +# * Introduction to Mixed Precision Training and AMP: +# `video `_, +# `slides `_ +# * native PyTorch AMP is available starting from PyTorch 1.6: +# `documentation `_, +# `examples `_, +# `tutorial `_ +# +# + +############################################################################### +# Pre-allocate memory in case of variable input length +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Models for speech recognition or for NLP are often trained on input tensors +# with variable sequence length. Variable length can be problematic for PyTorch +# caching allocator and can lead to reduced performance or to unexpected +# out-of-memory errors. 
If a batch with a short sequence length is followed by
+# another batch with a longer sequence length, then PyTorch is forced to
+# release the intermediate buffers from the previous iteration and to
+# re-allocate new buffers. This process is time-consuming and causes
+# fragmentation in the caching allocator, which may result in out-of-memory
+# errors.
+#
+# A typical solution is to implement pre-allocation. It consists of the
+# following steps:
+#
+# #. generate a (usually random) batch of inputs with the maximum sequence
+#    length (either corresponding to the max length in the training dataset or
+#    to some predefined threshold)
+# #. execute a forward and a backward pass with the generated batch; do not
+#    run the optimizer or the learning rate scheduler. This step pre-allocates
+#    buffers of maximum size, which can be reused in subsequent
+#    training iterations
+# #. zero out gradients
+# #. proceed to regular training
+#
+
+###############################################################################
+# Distributed optimizations
+# -------------------------
+
+###############################################################################
+# Use efficient data-parallel backend
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# PyTorch has two ways to implement data-parallel training:
+#
+# * `torch.nn.DataParallel `_
+# * `torch.nn.parallel.DistributedDataParallel `_
+#
+# ``DistributedDataParallel`` offers much better performance and scaling to
+# multiple GPUs. For more information refer to the
+# `relevant section of CUDA Best Practices `_
+# in the PyTorch documentation.
+
+###############################################################################
+# Skip unnecessary all-reduce if training with DistributedDataParallel and gradient accumulation
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# By default,
+# `torch.nn.parallel.DistributedDataParallel `_
+# executes gradient all-reduce after every backward pass to compute the average
+# gradient over all workers participating in the training. If training uses
+# gradient accumulation over N steps, then all-reduce is not necessary after
+# every training step; it's only required after the last call to backward,
+# just before the execution of the optimizer.
+#
+# ``DistributedDataParallel`` provides the
+# `no_sync() `_
+# context manager, which disables gradient all-reduce for a particular
+# iteration. ``no_sync()`` should be applied to the first ``N-1`` iterations of
+# gradient accumulation; the last iteration should follow the default execution
+# and perform the required gradient all-reduce.
+
+###############################################################################
+# Match the order of layers in constructors and during the execution if using DistributedDataParallel(find_unused_parameters=True)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# `torch.nn.parallel.DistributedDataParallel `_
+# with ``find_unused_parameters=True`` uses the order of layers and parameters
+# from model constructors to build buckets for ``DistributedDataParallel``
+# gradient all-reduce. ``DistributedDataParallel`` overlaps all-reduce with the
+# backward pass. All-reduce for a particular bucket is asynchronously triggered
+# only when all gradients for parameters in a given bucket are available.
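+#
+# For reference, here is a minimal (hypothetical) sketch of how this flag is
+# passed when wrapping a model; the process group setup and ``local_rank`` are
+# assumed to have been initialized elsewhere::
+#
+#    ddp_model = torch.nn.parallel.DistributedDataParallel(
+#        model,
+#        device_ids=[local_rank],
+#        find_unused_parameters=True)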
+# +# To maximize the amount of overlap, the order in model constructors should +# roughly match the order during the execution. If the order doesn't match, then +# all-reduce for the entire bucket waits for the gradient which is the last to +# arrive, this may reduce the overlap between backward pass and all-reduce, +# all-reduce may end up being exposed, which slows down the training. +# +# ``DistributedDataParallel`` with ``find_unused_parameters=False`` (which is +# the default setting) relies on automatic bucket formation based on order of +# operations encountered during the backward pass. With +# ``find_unused_parameters=False`` it's not necessary to reorder layers or +# parameters to achieve optimal performance. + +############################################################################### +# Load-balance workload in a distributed setting +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Load imbalance typically may happen for models processing sequential data +# (speech recognition, translation, language models etc.). If one device +# receives a batch of data with sequence length longer than sequence lengths for +# the remaining devices, then all devices wait for the worker which finishes +# last. Backward pass functions as an implicit synchronization point in a +# distributed setting with +# `DistributedDataParallel `_ +# backend. +# +# There are multiple ways to solve the load balancing problem. The core idea is +# to distribute workload over all workers as uniformly as possible within each +# global batch. For example Transformer solves imbalance by forming batches with +# approximately constant number of tokens (and variable number of sequences in a +# batch), other models solve imbalance by bucketing samples with similar +# sequence length or even by sorting dataset by sequence length. diff --git a/recipes_source/recipes/what_is_state_dict.py b/recipes_source/recipes/what_is_state_dict.py index 8e718e9071e..5e7f259fd7b 100644 --- a/recipes_source/recipes/what_is_state_dict.py +++ b/recipes_source/recipes/what_is_state_dict.py @@ -28,7 +28,7 @@ :: - pip install torchaudio + pip install torch """ diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index e842c19bae5..6d78ff4ec3c 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -1,6 +1,6 @@ PyTorch Recipes --------------------------------------------- -Recipes are bite-sized bite-sized, actionable examples of how to use specific PyTorch features, different from our full-length tutorials. +Recipes are bite-sized, actionable examples of how to use specific PyTorch features, different from our full-length tutorials. .. raw:: html @@ -40,14 +40,14 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py .. customcarditem:: :header: Defining a Neural Network - :card_description: Learn how to use PyTorch's torch.nn package to create and define a neural network the MNIST dataset. + :card_description: Learn how to use PyTorch's torch.nn package to create and define a neural network for the MNIST dataset. :image: ../_static/img/thumbnails/cropped/defining-a-network.PNG :link: ../recipes/recipes/defining_a_neural_network.html :tags: Basics .. customcarditem:: :header: What is a state_dict in PyTorch - :card_description: Learn how state_dict objects, Python dictionaries, are used in saving or loading models from PyTorch. 
+ :card_description: Learn how state_dict objects and Python dictionaries are used in saving or loading models from PyTorch. :image: ../_static/img/thumbnails/cropped/what-is-a-state-dict.PNG :link: ../recipes/recipes/what_is_state_dict.html :tags: Basics @@ -90,7 +90,7 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py .. customcarditem:: :header: Zeroing out gradients in PyTorch - :card_description: Learn when you should zero out graidents and how doing so can help increase the accuracy of your model. + :card_description: Learn when you should zero out gradients and how doing so can help increase the accuracy of your model. :image: ../_static/img/thumbnails/cropped/zeroing-out-gradients.PNG :link: ../recipes/recipes/zeroing_out_gradients.html :tags: Basics @@ -166,6 +166,31 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py :image: ../_static/img/thumbnails/cropped/android.png :link: ../recipes/android_native_app_with_custom_op.html :tags: Mobile + +.. customcarditem:: + :header: Profiling PyTorch RPC-Based Workloads + :card_description: How to use the PyTorch profiler to profile RPC-based workloads. + :image: ../_static/img/thumbnails/cropped/profile.png + :link: ../recipes/distributed_rpc_profiling.html + :tags: Production + +.. Automatic Mixed Precision + +.. customcarditem:: + :header: Automatic Mixed Precision + :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs. + :image: ../_static/img/thumbnails/cropped/amp.png + :link: ../recipes/recipes/amp_recipe.html + :tags: Model-Optimization + +.. Performance + +.. customcarditem:: + :header: Performance Tuning Guide + :card_description: Tips for achieving optimal performance. + :image: ../_static/img/thumbnails/cropped/profiler.png + :link: ../recipes/recipes/tuning_guide.html + :tags: Model-Optimization .. End of tutorial card section @@ -199,6 +224,8 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py /recipes/recipes/Captum_Recipe /recipes/recipes/tensorboard_with_pytorch /recipes/recipes/dynamic_quantization + /recipes/recipes/amp_recipe + /recipes/recipes/tuning_guide /recipes/torchscript_inference /recipes/deployment_with_flask /recipes/distributed_rpc_profiling diff --git a/requirements.txt b/requirements.txt index a0aca3ca028..5e87cf36170 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ bs4 awscli==1.16.35 flask spacy +ray[tune] # PyTorch Theme -e git+git://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme