pytorch
diff --git a/‎.circleci/scripts/build_for_windows.sh
Lines changed: 2 additions & 2 deletions b/‎.circleci/scripts/build_for_windows.sh
Lines changed: 2 additions & 2 deletions
diff --git a/‎_static/img/thumbnails/cropped/parametrizations.png
34.9 KB b/‎_static/img/thumbnails/cropped/parametrizations.png
34.9 KB
diff --git a/‎advanced_source/cpp_export.rst
Lines changed: 2 additions & 2 deletions b/‎advanced_source/cpp_export.rst
Lines changed: 2 additions & 2 deletions
diff --git a/‎advanced_source/cpp_extension.rst
Lines changed: 15 additions & 15 deletions b/‎advanced_source/cpp_extension.rst
Lines changed: 15 additions & 15 deletions
diff --git a/‎advanced_source/ddp_pipeline.py
Lines changed: 8 additions & 14 deletions b/‎advanced_source/ddp_pipeline.py
Lines changed: 8 additions & 14 deletions
diff --git a/‎advanced_source/super_resolution_with_onnxruntime.py
Lines changed: 1 addition & 1 deletion b/‎advanced_source/super_resolution_with_onnxruntime.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎beginner_source/Intro_to_TorchScript_tutorial.py
Lines changed: 7 additions & 3 deletions b/‎beginner_source/Intro_to_TorchScript_tutorial.py
Lines changed: 7 additions & 3 deletions
diff --git a/‎beginner_source/PyTorch Cheat.md
Lines changed: 1 addition & 1 deletion b/‎beginner_source/PyTorch Cheat.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎beginner_source/basics/data_tutorial.py
Lines changed: 2 additions & 2 deletions b/‎beginner_source/basics/data_tutorial.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎beginner_source/basics/optimization_tutorial.py
Lines changed: 2 additions & 2 deletions b/‎beginner_source/basics/optimization_tutorial.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎beginner_source/basics/quickstart_tutorial.py
Lines changed: 1 addition & 1 deletion b/‎beginner_source/basics/quickstart_tutorial.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎beginner_source/basics/transforms_tutorial.py
Lines changed: 2 additions & 2 deletions b/‎beginner_source/basics/transforms_tutorial.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎beginner_source/blitz/cifar10_tutorial.py
Lines changed: 5 additions & 5 deletions b/‎beginner_source/blitz/cifar10_tutorial.py
Lines changed: 5 additions & 5 deletions
diff --git a/‎beginner_source/blitz/neural_networks_tutorial.py
Lines changed: 2 additions & 2 deletions b/‎beginner_source/blitz/neural_networks_tutorial.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎beginner_source/data_loading_tutorial.py
Lines changed: 9 additions & 3 deletions b/‎beginner_source/data_loading_tutorial.py
Lines changed: 9 additions & 3 deletions
diff --git a/‎beginner_source/dcgan_faces_tutorial.py
Lines changed: 3 additions & 3 deletions b/‎beginner_source/dcgan_faces_tutorial.py
Lines changed: 3 additions & 3 deletions
@@ -50,8 +50,8 @@ if [[ "${CIRCLE_JOB}" == *worker_* ]]; then
   python $DIR/remove_runnable_code.py beginner_source/hyperparameter_tuning_tutorial.py beginner_source/hyperparameter_tuning_tutorial.py || true
   python $DIR/remove_runnable_code.py beginner_source/audio_preprocessing_tutorial.py  beginner_source/audio_preprocessing_tutorial.py || true
   python $DIR/remove_runnable_code.py intermediate_source/tensorboard_profiler_tutorial.py intermediate_source/tensorboard_profiler_tutorial.py || true
-  # Temp remove for mnist download issue.
-  python $DIR/remove_runnable_code.py beginner_source/fgsm_tutorial.py  beginner_source/fgsm_tutorial.py || true
+  # Temp remove for mnist download issue. (Re-enabled for 1.8.1)
+  # python $DIR/remove_runnable_code.py beginner_source/fgsm_tutorial.py  beginner_source/fgsm_tutorial.py || true
 
   export WORKER_ID=$(echo "${CIRCLE_JOB}" | tr -dc '0-9')
   count=0
 
@@ -115,7 +115,7 @@ If you need to exclude some methods in your ``nn.Module``
 because they use Python features that TorchScript doesn't support yet,
 you could annotate those with ``@torch.jit.ignore``
 
-``my_module`` is an instance of
+``sm`` is an instance of
 ``ScriptModule`` that is ready for serialization.
 
 Step 2: Serializing Your Script Module to a File
@@ -132,7 +132,7 @@ on the module and pass it a filename::
   traced_script_module.save("traced_resnet_model.pt")
 
 This will produce a ``traced_resnet_model.pt`` file in your working directory.
-If you also would like to serialize ``my_module``, call ``my_module.save("my_module_model.pt")``
+If you also would like to serialize ``sm``, call ``sm.save("my_module_model.pt")``.
 We have now officially left the realm of Python and are ready to cross over to the sphere
 of C++.
 
 
@@ -115,13 +115,13 @@ PyTorch has no knowledge of the *algorithm* you are implementing. It knows only
 of the individual operations you use to compose your algorithm. As such, PyTorch
 must execute your operations individually, one after the other. Since each
 individual call to the implementation (or *kernel*) of an operation, which may
-involve launch of a CUDA kernel, has a certain amount of overhead, this overhead
-may become significant across many function calls. Furthermore, the Python
-interpreter that is running our code can itself slow down our program.
+involve the launch of a CUDA kernel, has a certain amount of overhead, this
+overhead may become significant across many function calls. Furthermore, the
+Python interpreter that is running our code can itself slow down our program.
 
 A definite method of speeding things up is therefore to rewrite parts in C++ (or
 CUDA) and *fuse* particular groups of operations. Fusing means combining the
-implementations of many functions into a single functions, which profits from
+implementations of many functions into a single function, which profits from
 fewer kernel launches as well as other optimizations we can perform with
 increased visibility of the global flow of data.
 
@@ -509,12 +509,12 @@ and with our new C++ version::
   Forward: 349.335 us | Backward 443.523 us
 
 We can already see a significant speedup for the forward function (more than
-30%). For the backward function a speedup is visible, albeit not major one. The
-backward pass I wrote above was not particularly optimized and could definitely
-be improved. Also, PyTorch's automatic differentiation engine can automatically
-parallelize computation graphs, may use a more efficient flow of operations
-overall, and is also implemented in C++, so it's expected to be fast.
-Nevertheless, this is a good start.
+30%). For the backward function, a speedup is visible, albeit not a major one.
+The backward pass I wrote above was not particularly optimized and could
+definitely be improved. Also, PyTorch's automatic differentiation engine can
+automatically parallelize computation graphs, may use a more efficient flow of
+operations overall, and is also implemented in C++, so it's expected to be
+fast. Nevertheless, this is a good start.
 
 Performance on GPU Devices
 **************************
@@ -571,7 +571,7 @@ And C++/ATen::
 
 That's a great overall speedup compared to non-CUDA code. However, we can pull
 even more performance out of our C++ code by writing custom CUDA kernels, which
-we'll dive into soon. Before that, let's dicuss another way of building your C++
+we'll dive into soon. Before that, let's discuss another way of building your C++
 extensions.
 
 JIT Compiling Extensions
@@ -851,7 +851,7 @@ and ``Double``), you can use ``AT_DISPATCH_ALL_TYPES``.
 
 Note that we perform some operations with plain ATen. These operations will
 still run on the GPU, but using ATen's default implementations. This makes
-sense, because ATen will use highly optimized routines for things like matrix
+sense because ATen will use highly optimized routines for things like matrix
 multiplies (e.g. ``addmm``) or convolutions which would be much harder to
 implement and improve ourselves.
 
@@ -903,7 +903,7 @@ You can see in the CUDA kernel that we work directly on pointers with the right
 type. Indeed, working directly with high-level, type-agnostic tensors inside CUDA
 kernels would be very inefficient.
 
-However, this comes at a cost of ease of use and readibility, especially for
+However, this comes at a cost of ease of use and readability, especially for
 high-dimensional data. In our example, we know that the contiguous
 ``gates`` tensor has 3 dimensions:
 
@@ -920,7 +920,7 @@ arithmetic.
   gates.data<scalar_t>()[n*3*state_size + row*state_size + column]
 
 
-In addition to being verbose, this expression needs stride to be explicitely
+In addition to being verbose, this expression needs stride to be explicitly
 known, and thus passed to the kernel function within its arguments. You can see
 that in the case of kernel functions accepting multiple tensors with different
 sizes you will end up with a very long list of arguments.
@@ -1101,7 +1101,7 @@ on it:
     const int threads = 1024;
     const dim3 blocks((state_size + threads - 1) / threads, batch_size);
 
-    AT_DISPATCH_FLOATING_TYPES(X.type(), "lltm_forward_cuda", ([&] {
+    AT_DISPATCH_FLOATING_TYPES(X.type(), "lltm_backward_cuda", ([&] {
       lltm_cuda_backward_kernel<scalar_t><<<blocks, threads>>>(
           d_old_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
           d_gates.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
 
@@ -89,7 +89,6 @@ def forward(self, x):
 class Encoder(nn.Module):
     def __init__(self, ntoken, ninp, dropout=0.5):
         super(Encoder, self).__init__()
-        self.src_mask = None
         self.pos_encoder = PositionalEncoding(ninp, dropout)
         self.encoder = nn.Embedding(ntoken, ninp)
         self.ninp = ninp
@@ -99,17 +98,9 @@ def init_weights(self):
         initrange = 0.1
         self.encoder.weight.data.uniform_(-initrange, initrange)
 
-    def _generate_square_subsequent_mask(self, sz):
-        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-        return mask
-
     def forward(self, src):
-        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
-            device = src.device
-            mask = self._generate_square_subsequent_mask(src.size(0)).to(device)
-            self.src_mask = mask
-
+        # Need (S, N) format for encoder.
+        src = src.t()
         src = self.encoder(src) * math.sqrt(self.ninp)
         return self.pos_encoder(src)
 
@@ -125,7 +116,8 @@ def init_weights(self):
         self.decoder.weight.data.uniform_(-initrange, initrange)
 
     def forward(self, inp):
-        return self.decoder(inp)
+        # Need batch dimension first for output of pipeline.
+        return self.decoder(inp).permute(1, 0, 2)
 
 ######################################################################
 # Start multiple processes for training
@@ -245,7 +237,8 @@ def get_batch(source, i):
         seq_len = min(bptt, len(source) - 1 - i)
         data = source[i:i+seq_len]
         target = source[i+1:i+1+seq_len].view(-1)
-        return data, target
+        # Need batch dimension first for pipeline parallelism.
+        return data.t(), target
 
 ######################################################################
 # Model scale and Pipe initialization
@@ -318,8 +311,9 @@ def get_batch(source, i):
     # Need to use 'checkpoint=never' since as of PyTorch 1.8, Pipe checkpointing
     # doesn't work with DDP.
     from torch.distributed.pipeline.sync import Pipe
+    chunks = 8
     model = Pipe(torch.nn.Sequential(
-        *module_list), chunks = 8, checkpoint="never")
+        *module_list), chunks = chunks, checkpoint="never")
 
     # Initialize process group and wrap model in DDP.
     from torch.nn.parallel import DistributedDataParallel
 
@@ -145,7 +145,7 @@ def _initialize_weights(self):
                   do_constant_folding=True,  # whether to execute constant folding for optimization
                   input_names = ['input'],   # the model's input names
                   output_names = ['output'], # the model's output names
-                  dynamic_axes={'input' : {0 : 'batch_size'},    # variable lenght axes
+                  dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
                                 'output' : {0 : 'batch_size'}})
 
 ######################################################################
 
@@ -77,7 +77,7 @@ def forward(self, x, h):
 #    cell <https://colah.github.io/posts/2015-08-Understanding-LSTMs/>`__–that
 #    is–it’s a function that is applied on a loop.
 #
-# We instantiated the module, and made ``x`` and ``y``, which are just 3x4
+# We instantiated the module, and made ``x`` and ``h``, which are just 3x4
 # matrices of random values. Then we invoked the cell with
 # ``my_cell(x, h)``. This in turn calls our ``forward`` function.
 #
@@ -274,6 +274,8 @@ def forward(self, x, h):
 
 my_cell = MyCell(MyDecisionGate())
 traced_cell = torch.jit.trace(my_cell, (x, h))
+
+print(traced_cell.dg.code)
 print(traced_cell.code)
 
 
@@ -293,8 +295,10 @@ def forward(self, x, h):
 scripted_gate = torch.jit.script(MyDecisionGate())
 
 my_cell = MyCell(scripted_gate)
-traced_cell = torch.jit.script(my_cell)
-print(traced_cell.code)
+scripted_cell = torch.jit.script(my_cell)
+
+print(scripted_gate.code)
+print(scripted_cell.code)
 
 
 ######################################################################
 
@@ -50,7 +50,7 @@ See [onnx](https://pytorch.org/docs/stable/onnx.html)
 from torchvision import datasets, models, transforms     # vision datasets, architectures & transforms
 import torchvision.transforms as transforms              # composable transforms
 ```
-See [torchvision](https://pytorch.org/docs/stable/torchvision/index.html)
+See [torchvision](https://pytorch.org/vision/stable/index.html)
 
 ### Distributed Training
 
 
@@ -25,7 +25,7 @@
 # PyTorch domain libraries provide a number of pre-loaded datasets (such as FashionMNIST) that 
 # subclass ``torch.utils.data.Dataset`` and implement functions specific to the particular data.
 # They can be used to prototype and benchmark your model. You can find them
-# here: `Image Datasets <https://pytorch.org/docs/stable/torchvision/datasets.html>`_,
+# here: `Image Datasets <https://pytorch.org/vision/stable/datasets.html>`_,
 # `Text Datasets  <https://pytorch.org/text/stable/datasets.html>`_, and
 # `Audio Datasets <https://pytorch.org/audio/stable/datasets.html>`_
 #
@@ -38,7 +38,7 @@
 # Fashion-MNIST is a dataset of Zalando’s article images consisting of 60,000 training examples and 10,000 test examples.
 # Each example comprises a 28×28 grayscale image and an associated label from one of 10 classes.
 #
-# We load the `FashionMNIST Dataset <https://pytorch.org/docs/stable/torchvision/datasets.html#fashion-mnist>`_ with the following parameters:
+# We load the `FashionMNIST Dataset <https://pytorch.org/vision/stable/datasets.html#fashion-mnist>`_ with the following parameters:
 #  - ``root`` is the path where the train/test data is stored,
 #  - ``train`` specifies training or test dataset,
 #  - ``download=True`` downloads the data from the internet if it's not available at ``root``.
 
@@ -12,13 +12,13 @@
 Optimizing Model Parameters
 ===========================
 
-Now that we have a model and data it's time to train, validate and test our model by optimizing it's parameters on 
+Now that we have a model and data it's time to train, validate and test our model by optimizing its parameters on 
 our data. Training a model is an iterative process; in each iteration (called an *epoch*) the model makes a guess about the output, calculates 
 the error in its guess (*loss*), collects the derivatives of the error with respect to its parameters (as we saw in 
 the `previous section  <autograd_tutorial.html>`_), and **optimizes** these parameters using gradient descent. For a more 
 detailed walkthrough of this process, check out this video on `backpropagation from 3Blue1Brown <https://www.youtube.com/watch?v=tIeHLnjs5U8>`__.
 
-Pre-requisite Code 
+Prerequisite Code 
 -----------------
 We load the code from the previous sections on `Datasets & DataLoaders <data_tutorial.html>`_ 
 and `Build Model  <buildmodel_tutorial.html>`_.
 
@@ -35,7 +35,7 @@
 # all of which include datasets. For this tutorial, we  will be using a TorchVision dataset.
 #
 # The ``torchvision.datasets`` module contains ``Dataset`` objects for many real-world vision data like 
-# CIFAR, COCO (`full list here <https://pytorch.org/docs/stable/torchvision/datasets.html>`_). In this tutorial, we
+# CIFAR, COCO (`full list here <https://pytorch.org/vision/stable/datasets.html>`_). In this tutorial, we
 # use the FashionMNIST dataset. Every TorchVision ``Dataset`` includes two arguments: ``transform`` and
 # ``target_transform`` to modify the samples and labels respectively.
 
 
@@ -18,7 +18,7 @@
 
 All TorchVision datasets have two parameters -``transform`` to modify the features and
 ``target_transform`` to modify the labels - that accept callables containing the transformation logic.
-The `torchvision.transforms <https://pytorch.org/docs/stable/torchvision/transforms.html>`_ module offers 
+The `torchvision.transforms <https://pytorch.org/vision/stable/transforms.html>`_ module offers 
 several commonly-used transforms out of the box.
 
 The FashionMNIST features are in PIL Image format, and the labels are integers.
@@ -41,7 +41,7 @@
 # ToTensor()
 # -------------------------------
 #
-# `ToTensor <https://pytorch.org/docs/stable/torchvision/transforms.html#torchvision.transforms.ToTensor>`_ 
+# `ToTensor <https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.ToTensor>`_ 
 # converts a PIL image or NumPy ``ndarray`` into a ``FloatTensor`` and scales 
 # the image's pixel intensity values in the range [0., 1.]
 #
 
@@ -43,15 +43,15 @@
 
 We will do the following steps in order:
 
-1. Load and normalizing the CIFAR10 training and test datasets using
+1. Load and normalize the CIFAR10 training and test datasets using
    ``torchvision``
 2. Define a Convolutional Neural Network
 3. Define a loss function
 4. Train the network on the training data
 5. Test the network on the test data
 
-1. Loading and normalizing CIFAR10
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+1. Load and normalize CIFAR10
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Using ``torchvision``, it’s extremely easy to load CIFAR10.
 """
@@ -125,7 +125,7 @@ def imshow(img):
 
 class Net(nn.Module):
     def __init__(self):
-        super(Net, self).__init__()
+        super().__init__()
         self.conv1 = nn.Conv2d(3, 6, 5)
         self.pool = nn.MaxPool2d(2, 2)
         self.conv2 = nn.Conv2d(6, 16, 5)
@@ -320,7 +320,7 @@ def forward(self, x):
 #
 #         inputs, labels = data[0].to(device), data[1].to(device)
 #
-# Why dont I notice MASSIVE speedup compared to CPU? Because your network
+# Why don't I notice MASSIVE speedup compared to CPU? Because your network
 # is really small.
 #
 # **Exercise:** Try increasing the width of your network (argument 2 of
 
@@ -58,7 +58,7 @@ def __init__(self):
     def forward(self, x):
         # Max pooling over a (2, 2) window
         x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
-        # If the size is a square you can only specify a single number
+        # If the size is a square, you can specify with a single number
         x = F.max_pool2d(F.relu(self.conv2(x)), 2)
         x = x.view(-1, self.num_flat_features(x))
         x = F.relu(self.fc1(x))
@@ -176,7 +176,7 @@ def num_flat_features(self, x):
 #           -> loss
 #
 # So, when we call ``loss.backward()``, the whole graph is differentiated
-# w.r.t. the loss, and all Tensors in the graph that has ``requires_grad=True``
+# w.r.t. the loss, and all Tensors in the graph that have ``requires_grad=True``
 # will have their ``.grad`` Tensor accumulated with the gradient.
 #
 # For illustration, let us follow a few steps backward:
 
@@ -4,7 +4,7 @@
 ===================================================
 **Author**: `Sasank Chilamkurthy <https://chsasank.github.io>`_
 
-A lot of effort in solving any machine learning problem goes in to
+A lot of effort in solving any machine learning problem goes into
 preparing the data. PyTorch provides many tools to make data loading
 easy and, hopefully, to make your code more readable. In this tutorial,
 we will see how to load and preprocess/augment data from a non-trivial
@@ -104,7 +104,7 @@ def show_landmarks(image, landmarks):
 #
 # -  ``__len__`` so that ``len(dataset)`` returns the size of the dataset.
 # -  ``__getitem__`` to support the indexing such that ``dataset[i]`` can
-#    be used to get :math:`i`\ th sample
+#    be used to get :math:`i`\ th sample.
 #
 # Let's create a dataset class for our face landmarks dataset. We will
 # read the csv in ``__init__`` but leave the reading of images to
@@ -290,7 +290,13 @@ def __call__(self, sample):
         image = image.transpose((2, 0, 1))
         return {'image': torch.from_numpy(image),
                 'landmarks': torch.from_numpy(landmarks)}
-
+    
+######################################################################
+# .. note::
+#     In the example above, ``RandomCrop`` uses an external library's random number generator
+#     (in this case, NumPy's ``np.random.randint``). This can result in unexpected behavior with ``DataLoader``
+#     (see https://pytorch.org/docs/stable/notes/faq.html#my-data-loader-workers-return-identical-random-numbers).
+#     In practice, it is safer to stick to PyTorch's random number generator, e.g. by using ``torch.randint`` instead.
 
 ######################################################################
 # Compose transforms
 
@@ -554,7 +554,7 @@ def forward(self, input):
 # reported are:
 # 
 # -  **Loss_D** - discriminator loss calculated as the sum of losses for
-#    the all real and all fake batches (:math:`log(D(x)) + log(D(G(z)))`).
+#    the all real and all fake batches (:math:`log(D(x)) + log(1 - D(G(z)))`).
 # -  **Loss_G** - generator loss calculated as :math:`log(D(G(z)))`
 # -  **D(x)** - the average output (across the batch) of the discriminator
 #    for the all real batch. This should start close to 1 then
@@ -610,10 +610,10 @@ def forward(self, input):
         output = netD(fake.detach()).view(-1)
         # Calculate D's loss on the all-fake batch
         errD_fake = criterion(output, label)
-        # Calculate the gradients for this batch
+        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
         errD_fake.backward()
         D_G_z1 = output.mean().item()
-        # Add the gradients from the all-real and all-fake batches
+        # Compute error of D as sum over the fake and the real batches
         errD = errD_real + errD_fake
         # Update D
         optimizerD.step()
Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@`
`35`	`35`	`# all of which include datasets. For this tutorial, we will be using a TorchVision dataset.`
`36`	`36`	`#`
`37`	`37`	# The ``torchvision.datasets`` module contains ``Dataset`` objects for many real-world vision data like
`38`		-# CIFAR, COCO (`full list here <https://pytorch.org/docs/stable/torchvision/datasets.html>`_). In this tutorial, we
	`38`	+# CIFAR, COCO (`full list here <https://pytorch.org/vision/stable/datasets.html>`_). In this tutorial, we
`39`	`39`	# use the FashionMNIST dataset. Every TorchVision ``Dataset`` includes two arguments: ``transform`` and
`40`	`40`	# ``target_transform`` to modify the samples and labels respectively.
`41`	`41`