
Commit 920e97a

Merge branch 'master' into quant-fix
2 parents 0f7c71b + 250f741 commit 920e97a

6 files changed: +23 / -21 lines


beginner_source/basics/optimization_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ def forward(self, x):
 #####################################
 # Inside the training loop, optimization happens in three steps:
 # * Call ``optimizer.zero_grad()`` to reset the gradients of model parameters. Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration.
-# * Backpropagate the prediction loss with a call to ``loss.backwards()``. PyTorch deposits the gradients of the loss w.r.t. each parameter.
+# * Backpropagate the prediction loss with a call to ``loss.backward()``. PyTorch deposits the gradients of the loss w.r.t. each parameter.
 # * Once we have our gradients, we call ``optimizer.step()`` to adjust the parameters by the gradients collected in the backward pass.
 
 
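
As a point of reference, a minimal self-contained sketch of the three-step pattern the corrected comment describes; the model, loss function and data below are illustrative placeholders, not the tutorial's own objects.

import torch

# illustrative placeholders (not the tutorial's model/data)
model = torch.nn.Linear(28 * 28, 10)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

X = torch.randn(8, 28 * 28)        # a fake mini-batch of 8 flattened images
y = torch.randint(0, 10, (8,))     # fake integer class labels

optimizer.zero_grad()              # 1. reset parameter gradients (they accumulate by default)
loss = loss_fn(model(X), y)        # forward pass and loss
loss.backward()                    # 2. backpropagate: fills each parameter's .grad
optimizer.step()                   # 3. adjust parameters using the collected gradients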

beginner_source/blitz/autograd_tutorial.py

Lines changed: 8 additions & 3 deletions
@@ -32,11 +32,16 @@
 
 
 Usage in PyTorch
-~~~~~~~~~~~
+~~~~~~~~~~~~~~~~
 Let's take a look at a single training step.
 For this example, we load a pretrained resnet18 model from ``torchvision``.
 We create a random data tensor to represent a single image with 3 channels, and height & width of 64,
-and its corresponding ``label`` initialized to some random values.
+and its corresponding ``label`` initialized to some random values. Label in pretrained models has
+shape (1,1000).
+
+.. note::
+   This tutorial works only on CPU and will not work on GPU (even if tensors are moved to CUDA).
+
 """
 import torch, torchvision
 model = torchvision.models.resnet18(pretrained=True)
@@ -61,7 +66,7 @@
 loss.backward() # backward pass
 
 ############################################################
-# Next, we load an optimizer, in this case SGD with a learning rate of 0.01 and momentum of 0.9.
+# Next, we load an optimizer, in this case SGD with a learning rate of 0.01 and `momentum <https://towardsdatascience.com/stochastic-gradient-descent-with-momentum-a84097641a5d>`__ of 0.9.
 # We register all the parameters of the model in the optimizer.
 #
 
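
Pulled together from the surrounding context, a runnable sketch of the single training step this hunk documents; ``pretrained=True`` downloads the resnet18 weights on first use, and the random ``data``/``labels`` tensors stand in for a real image and target.

import torch, torchvision

model = torchvision.models.resnet18(pretrained=True)
data = torch.rand(1, 3, 64, 64)      # one random "image": 3 channels, height and width of 64
labels = torch.rand(1, 1000)         # random target with shape (1, 1000)

prediction = model(data)             # forward pass
loss = (prediction - labels).sum()
loss.backward()                      # backward pass: autograd deposits gradients in each parameter's .grad

optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
optim.step()                         # apply SGD with momentum to every registered parameter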

beginner_source/blitz/cifar10_tutorial.py

Lines changed: 11 additions & 14 deletions
@@ -110,7 +110,7 @@ def imshow(img):
 # show images
 imshow(torchvision.utils.make_grid(images))
 # print labels
-print(' '.join('%5s' % classes[labels[j]] for j in range(batch_size)))
+print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))
 
 
 ########################################################################
@@ -182,8 +182,7 @@ def forward(self, x):
         # print statistics
         running_loss += loss.item()
         if i % 2000 == 1999:    # print every 2000 mini-batches
-            print('[%d, %5d] loss: %.3f' %
-                  (epoch + 1, i + 1, running_loss / 2000))
+            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
             running_loss = 0.0
 
 print('Finished Training')
@@ -215,7 +214,7 @@ def forward(self, x):
 
 # print images
 imshow(torchvision.utils.make_grid(images))
-print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))
+print('GroundTruth: ', ' '.join(f'{classes[labels[j]]:5s}' for j in range(4)))
 
 ########################################################################
 # Next, let's load back in our saved model (note: saving and re-loading the model
@@ -236,7 +235,7 @@ def forward(self, x):
 # So, let's get the index of the highest energy:
 _, predicted = torch.max(outputs, 1)
 
-print('Predicted: ', ' '.join('%5s' % classes[predicted[j]]
+print('Predicted: ', ' '.join(f'{classes[predicted[j]]:5s}'
                               for j in range(4)))
 
 ########################################################################
@@ -250,15 +249,14 @@ def forward(self, x):
 with torch.no_grad():
     for data in testloader:
         images, labels = data
-            # calculate outputs by running images through the network
+        # calculate outputs by running images through the network
         outputs = net(images)
         # the class with the highest energy is what we choose as prediction
         _, predicted = torch.max(outputs.data, 1)
         total += labels.size(0)
         correct += (predicted == labels).sum().item()
 
-print('Accuracy of the network on the 10000 test images: %d %%' % (
-    100 * correct / total))
+print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
 
 ########################################################################
 # That looks way better than chance, which is 10% accuracy (randomly picking
@@ -275,21 +273,20 @@ def forward(self, x):
 # again no gradients needed
 with torch.no_grad():
     for data in testloader:
-            images, labels = data
-            outputs = net(images)
+        images, labels = data
+        outputs = net(images)
         _, predictions = torch.max(outputs, 1)
         # collect the correct predictions for each class
         for label, prediction in zip(labels, predictions):
             if label == prediction:
                 correct_pred[classes[label]] += 1
             total_pred[classes[label]] += 1
 
-
+
 # print accuracy for each class
 for classname, correct_count in correct_pred.items():
     accuracy = 100 * float(correct_count) / total_pred[classname]
-    print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
-                                                         accuracy))
+    print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')
 
 ########################################################################
 # Okay, so what next?
@@ -304,7 +301,7 @@ def forward(self, x):
 # Let's first define our device as the first visible cuda device if we have
 # CUDA available:
 
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 
 # Assuming that we are on a CUDA machine, this should print a CUDA device:
 
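
For the device-selection change in the last hunk, a small sketch of the usual pattern; ``net``, ``images`` and ``labels`` here are stand-ins for the tutorial's network and test batch.

import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)                                          # a CUDA device on a GPU machine, otherwise "cpu"

# stand-ins for the tutorial's objects
net = torch.nn.Linear(3 * 32 * 32, 10).to(device)     # .to(device) moves parameters and buffers
images = torch.randn(4, 3 * 32 * 32).to(device)       # inputs must be moved to the same device
labels = torch.randint(0, 10, (4,)).to(device)

with torch.no_grad():
    outputs = net(images)
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == labels).sum().item()
    print(f'Accuracy on this batch: {100 * correct // labels.size(0)} %')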

beginner_source/dcgan_faces_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -319,7 +319,7 @@ def weights_init(m):
 # .. figure:: /_static/img/dcgan_generator.png
 #    :alt: dcgan_generator
 #
-# Notice, the how the inputs we set in the input section (*nz*, *ngf*, and
+# Notice, how the inputs we set in the input section (*nz*, *ngf*, and
 # *nc*) influence the generator architecture in code. *nz* is the length
 # of the z input vector, *ngf* relates to the size of the feature maps
 # that are propagated through the generator, and *nc* is the number of
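
To make the role of *nz*, *ngf* and *nc* concrete, a trimmed two-layer sketch of a DCGAN-style generator; the tutorial's actual Generator class has several intermediate layers, so this is only an illustration.

import torch
import torch.nn as nn

nz, ngf, nc = 100, 64, 3     # latent vector length, generator feature-map size, image channels

gen = nn.Sequential(
    # z of shape (N, nz, 1, 1) enters here, so nz only shapes the first layer
    nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
    nn.BatchNorm2d(ngf * 8),
    nn.ReLU(True),
    # ... the full generator repeats this block, halving the ngf multiple at each stage ...
    nn.ConvTranspose2d(ngf * 8, nc, 4, 2, 1, bias=False),   # nc fixes the output channel count
    nn.Tanh(),
)

fake = gen(torch.randn(1, nz, 1, 1))
print(fake.shape)    # torch.Size([1, 3, 8, 8]) for this trimmed stack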

beginner_source/examples_nn/dynamic_net.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 
 To showcase the power of PyTorch dynamic graphs, we will implement a very strange
 model: a third-fifth order polynomial that on each forward pass
-chooses a random number between 3 and 5 and uses that many orders, reusing
+chooses a random number between 4 and 5 and uses that many orders, reusing
 the same weights multiple times to compute the fourth and fifth order.
 """
 import random
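
A compact sketch of the dynamic behaviour the corrected docstring describes: each forward pass draws the highest order (4 or 5) at random and reuses a single extra weight for those terms. The class below mirrors the tutorial's idea but is written out here only as an illustration.

import random
import torch

class DynamicNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # one parameter per fixed order, plus `e`, reused for the random higher orders
        self.a, self.b, self.c, self.d, self.e = (
            torch.nn.Parameter(torch.randn(())) for _ in range(5)
        )

    def forward(self, x):
        y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
        top = random.randint(4, 5)        # the graph is rebuilt on every call, so this can differ per pass
        for exp in range(4, top + 1):
            y = y + self.e * x ** exp     # the same weight `e` serves every extra order
        return y

y = DynamicNet()(torch.linspace(-1, 1, steps=20))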

beginner_source/nlp/sequence_models_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 
 A recurrent neural network is a network that maintains some kind of
 state. For example, its output could be used as part of the next input,
-so that information can propogate along as the network passes over the
+so that information can propagate along as the network passes over the
 sequence. In the case of an LSTM, for each element in the sequence,
 there is a corresponding *hidden state* :math:`h_t`, which in principle
 can contain information from arbitrary points earlier in the sequence.
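
As a quick illustration of the corrected sentence, a minimal nn.LSTM example showing the per-element hidden states h_t; the dimensions are chosen arbitrarily, not taken from the tutorial.

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=3, hidden_size=3)    # input dim 3, hidden dim 3
inputs = torch.randn(5, 1, 3)                  # a sequence of length 5, batch size 1
h0, c0 = torch.zeros(1, 1, 3), torch.zeros(1, 1, 3)

out, (hn, cn) = lstm(inputs, (h0, c0))
print(out.shape)   # (5, 1, 3): the hidden state h_t for every element of the sequence
print(hn.shape)    # (1, 1, 3): the hidden state after the final element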
