Merge branch 'master' into link

holly1238 · web-flow · commit a0adb7af43a5 · 2021-04-23T12:10:40.000-07:00
diff --git a/.circleci/scripts/build_for_windows.sh b/.circleci/scripts/build_for_windows.sh
@@ -49,6 +49,7 @@ if [[ "${CIRCLE_JOB}" == *worker_* ]]; then
   python $DIR/remove_runnable_code.py advanced_source/static_quantization_tutorial.py advanced_source/static_quantization_tutorial.py || true
   python $DIR/remove_runnable_code.py beginner_source/hyperparameter_tuning_tutorial.py beginner_source/hyperparameter_tuning_tutorial.py || true
   python $DIR/remove_runnable_code.py beginner_source/audio_preprocessing_tutorial.py  beginner_source/audio_preprocessing_tutorial.py || true
+  python $DIR/remove_runnable_code.py beginner_source/dcgan_faces_tutorial.py  beginner_source/dcgan_faces_tutorial.py || true
   python $DIR/remove_runnable_code.py intermediate_source/tensorboard_profiler_tutorial.py intermediate_source/tensorboard_profiler_tutorial.py || true
   # Temp remove for mnist download issue. (Re-enabled for 1.8.1)
   # python $DIR/remove_runnable_code.py beginner_source/fgsm_tutorial.py  beginner_source/fgsm_tutorial.py || true
diff --git a/beginner_source/basics/autogradqs_tutorial.py b/beginner_source/basics/autogradqs_tutorial.py
@@ -47,7 +47,7 @@
 #
 # In this network, ``w`` and ``b`` are **parameters**, which we need to
 # optimize. Thus, we need to be able to compute the gradients of loss
-# function with respect to those variables. In orded to do that, we set
+# function with respect to those variables. In order to do that, we set
 # the ``requires_grad`` property of those tensors.
 
 #######################################################################
diff --git a/beginner_source/chatbot_tutorial.py b/beginner_source/chatbot_tutorial.py
@@ -471,7 +471,7 @@ def trimRareWords(voc, pairs, MIN_COUNT):
 # with mini-batches.
 #
 # Using mini-batches also means that we must be mindful of the variation
-# of sentence length in our batches. To accomodate sentences of different
+# of sentence length in our batches. To accommodate sentences of different
 # sizes in the same batch, we will make our batched input tensor of shape
 # *(max_length, batch_size)*, where sentences shorter than the
 # *max_length* are zero padded after an *EOS_token*.
@@ -615,7 +615,7 @@ def batch2TrainData(voc, pair_batch):
 # in normal sequential order, and one that is fed the input sequence in
 # reverse order. The outputs of each network are summed at each time step.
 # Using a bidirectional GRU will give us the advantage of encoding both
-# past and future context.
+# past and future contexts.
 #
 # Bidirectional RNN:
 #
@@ -700,7 +700,7 @@ def forward(self, input_seq, input_lengths, hidden=None):
 # states to generate the next word in the sequence. It continues
 # generating words until it outputs an *EOS_token*, representing the end
 # of the sentence. A common problem with a vanilla seq2seq decoder is that
-# if we rely soley on the context vector to encode the entire input
+# if we rely solely on the context vector to encode the entire input
 # sequence’s meaning, it is likely that we will have information loss.
 # This is especially the case when dealing with long input sequences,
 # greatly limiting the capability of our decoder.
@@ -950,7 +950,7 @@ def maskNLLLoss(inp, target, mask):
 #   sequence (or batch of sequences). We use the ``GRU`` layer like this in
 #   the ``encoder``. The reality is that under the hood, there is an
 #   iterative process looping over each time step calculating hidden states.
-#   Alternatively, you ran run these modules one time-step at a time. In
+#   Alternatively, you can run these modules one time-step at a time. In
 #   this case, we manually loop over the sequences during the training
 #   process like we must do for the ``decoder`` model. As long as you
 #   maintain the correct conceptual model of these modules, implementing
@@ -1115,7 +1115,7 @@ def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, deco
 # softmax value. This decoding method is optimal on a single time-step
 # level.
 #
-# To facilite the greedy decoding operation, we define a
+# To facilitate the greedy decoding operation, we define a
 # ``GreedySearchDecoder`` class. When run, an object of this class takes
 # an input sequence (``input_seq``) of shape *(input_seq length, 1)*, a
 # scalar input length (``input_length``) tensor, and a ``max_length`` to
diff --git a/beginner_source/nlp/pytorch_tutorial.py b/beginner_source/nlp/pytorch_tutorial.py
@@ -9,7 +9,7 @@
 All of deep learning is computations on tensors, which are
 generalizations of a matrix that can be indexed in more than 2
 dimensions. We will see exactly what this means in-depth later. First,
-lets look what we can do with tensors.
+let's look what we can do with tensors.
 """
 # Author: Robert Guthrie
 
@@ -162,7 +162,7 @@
 # other operation, etc.)
 #
 # If ``requires_grad=True``, the Tensor object keeps track of how it was
-# created. Lets see it in action.
+# created. Let's see it in action.
 #
 
 # Tensor factory methods have a ``requires_grad`` flag
@@ -187,7 +187,7 @@
 # But how does that help us compute a gradient?
 #
 
-# Lets sum up all the entries in z
+# Let's sum up all the entries in z
 s = z.sum()
 print(s)
 print(s.grad_fn)
@@ -222,7 +222,7 @@
 
 
 ######################################################################
-# Lets have Pytorch compute the gradient, and see that we were right:
+# Let's have Pytorch compute the gradient, and see that we were right:
 # (note if you run this block multiple times, the gradient will increment.
 # That is because Pytorch *accumulates* the gradient into the .grad
 # property, since for many models this is very convenient.)
diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
@@ -45,15 +45,16 @@
 #
 
 import math
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn import TransformerEncoder, TransformerEncoderLayer
 
 class TransformerModel(nn.Module):
 
     def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
         super(TransformerModel, self).__init__()
-        from torch.nn import TransformerEncoder, TransformerEncoderLayer
         self.model_type = 'Transformer'
         self.pos_encoder = PositionalEncoding(ninp, dropout)
         encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
@@ -251,12 +252,13 @@ def get_batch(source, i):
 # function to scale all the gradient together to prevent exploding.
 #
 
+import time
+
 criterion = nn.CrossEntropyLoss()
 lr = 5.0 # learning rate
 optimizer = torch.optim.SGD(model.parameters(), lr=lr)
 scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
 
-import time
 def train():
     model.train() # Turn on the train mode
     total_loss = 0.
diff --git a/index.rst b/index.rst
@@ -497,7 +497,7 @@ Additional Resources
    :header: PyTorch Cheat Sheet
    :description: Quick overview to essential PyTorch elements.
    :button_link: beginner/ptcheat.html
-   :button_text: Download
+   :button_text: Open
 
 .. customcalloutitem::
    :header: Tutorials on GitHub
@@ -509,7 +509,7 @@ Additional Resources
    :header: Run Tutorials on Google Colab
    :description: Learn how to copy tutorial data into Google Drive so that you can run tutorials on Google Colab.
    :button_link: beginner/colab.html
-   :button_text: Download
+   :button_text: Open
 
 .. End of callout section
 
diff --git a/intermediate_source/torchvision_tutorial.rst b/intermediate_source/torchvision_tutorial.rst
@@ -64,7 +64,7 @@ training and evaluation, and will use the evaluation scripts from
 One note on the ``labels``. The model considers class ``0`` as background. If your dataset does not contain the background class, you should not have ``0`` in your ``labels``. For example, assuming you have just two classes, *cat* and *dog*, you can define ``1`` (not ``0``) to represent *cats* and ``2`` to represent *dogs*. So, for instance, if one of the images has both classes, your ``labels`` tensor should look like ``[1,2]``.
 
 Additionally, if you want to use aspect ratio grouping during training
-(so that each batch only contains images with similar aspect ratio),
+(so that each batch only contains images with similar aspect ratios),
 then it is recommended to also implement a ``get_height_and_width``
 method, which returns the height and the width of the image. If this
 method is not provided, we query all elements of the dataset via

Original file line number	Diff line number	Diff line change
`@@ -47,7 +47,7 @@`
`47`	`47`	`#`
`48`	`48`	# In this network, ``w`` and ``b`` are parameters, which we need to
`49`	`49`	`# optimize. Thus, we need to be able to compute the gradients of loss`
`50`		`-# function with respect to those variables. In orded to do that, we set`
	`50`	`+# function with respect to those variables. In order to do that, we set`
`51`	`51`	# the ``requires_grad`` property of those tensors.
`52`	`52`
`53`	`53`	`#######################################################################`