
Commit ee6256c

ptrblck and Jessica Lin authored
update out of place ops, update act fn, optimizers, schedulers, samplers (#989)
Co-authored-by: pbialecki <pbialecki@nvidia.com>
Co-authored-by: Jessica Lin <jplin@fb.com>
1 parent af754cb commit ee6256c

File tree

1 file changed: +58 −50 lines

beginner_source/ptcheat.rst

Lines changed: 58 additions & 50 deletions
@@ -80,8 +80,8 @@ Distributed Training

 .. code-block:: python

-    import torch.distributed as dist             # distributed communication
-    from multiprocessing import Process          # memory sharing processes
+    import torch.distributed as dist             # distributed communication
+    from torch.multiprocessing import Process    # memory sharing processes

 See `distributed <https://pytorch.org/docs/stable/distributed.html>`__
 and
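
Editor's aside, not part of the commit: a minimal sketch of how the updated import is typically used — two CPU workers started with torch.multiprocessing, joining a gloo process group and all-reducing a tensor. The rendezvous address, port, and world size are arbitrary illustrative choices.

    import os
    import torch
    import torch.distributed as dist
    from torch.multiprocessing import Process

    def run(rank, world_size):
        # join the default process group and sum a tensor across workers
        dist.init_process_group("gloo", rank=rank, world_size=world_size)
        t = torch.ones(1)
        dist.all_reduce(t)                        # t now holds world_size on every rank
        dist.destroy_process_group()

    if __name__ == "__main__":
        os.environ["MASTER_ADDR"] = "127.0.0.1"   # illustrative rendezvous settings
        os.environ["MASTER_PORT"] = "29500"
        world_size = 2
        procs = [Process(target=run, args=(r, world_size)) for r in range(world_size)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
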
@@ -95,13 +95,13 @@ Creation

 .. code-block:: python

-    torch.randn(*size)              # tensor with independent N(0,1) entries
-    torch.[ones|zeros](*size)       # tensor with all 1's [or 0's]
-    torch.Tensor(L)                 # create tensor from [nested] list or ndarray L
-    x.clone()                       # clone of x
-    with torch.no_grad():           # code wrap that stops autograd from tracking tensor history
-    requires_grad=True              # arg, when set to True, tracks computation
-                                    # history for future derivative calculations
+    x = torch.randn(*size)          # tensor with independent N(0,1) entries
+    x = torch.[ones|zeros](*size)   # tensor with all 1's [or 0's]
+    x = torch.tensor(L)             # create tensor from [nested] list or ndarray L
+    y = x.clone()                   # clone of x
+    with torch.no_grad():           # code wrap that stops autograd from tracking tensor history
+    requires_grad=True              # arg, when set to True, tracks computation
+                                    # history for future derivative calculations

 See `tensor <https://pytorch.org/docs/stable/tensors.html>`__
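
For reference, a runnable sketch (not from the commit) of the out-of-place creation style the new lines document; shapes and values are arbitrary.

    import torch

    x = torch.randn(2, 3)                 # 2x3 tensor with N(0,1) entries
    x = torch.zeros(2, 3)                 # all zeros
    x = torch.tensor([[1, 2], [3, 4]])    # from a nested Python list
    y = x.clone()                         # independent copy of x

    with torch.no_grad():                 # ops in this block are not tracked by autograd
        z = x * 2

    w = torch.randn(2, 2, requires_grad=True)   # subsequent ops on w are tracked
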

@@ -110,14 +110,16 @@ Dimensionality

 .. code-block:: python

-    x.size()                                  # return tuple-like object of dimensions
-    torch.cat(tensor_seq, dim=0)              # concatenates tensors along dim
-    x.view(a,b,...)                           # reshapes x into size (a,b,...)
-    x.view(-1,a)                              # reshapes x into size (b,a) for some b
-    x.transpose(a,b)                          # swaps dimensions a and b
-    x.permute(*dims)                          # permutes dimensions
-    x.unsqueeze(dim)                          # tensor with added axis
-    x.unsqueeze(dim=2)                        # (a,b,c) tensor -> (a,b,1,c) tensor
+    x.size()                                  # return tuple-like object of dimensions
+    x = torch.cat(tensor_seq, dim=0)          # concatenates tensors along dim
+    y = x.view(a,b,...)                       # reshapes x into size (a,b,...)
+    y = x.view(-1,a)                          # reshapes x into size (b,a) for some b
+    y = x.transpose(a,b)                      # swaps dimensions a and b
+    y = x.permute(*dims)                      # permutes dimensions
+    y = x.unsqueeze(dim)                      # tensor with added axis
+    y = x.unsqueeze(dim=2)                    # (a,b,c) tensor -> (a,b,1,c) tensor
+    y = x.squeeze()                           # removes all dimensions of size 1 (a,1,b,1) -> (a,b)
+    y = x.squeeze(dim=1)                      # removes specified dimension of size 1 (a,1,b,1) -> (a,b,1)

 See `tensor <https://pytorch.org/docs/stable/tensors.html>`__
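
A quick shape walk-through (editor's illustration, not part of the commit) of the calls above, starting from an arbitrary 2x3 tensor.

    import torch

    x = torch.randn(2, 3)
    print(x.size())               # torch.Size([2, 3])

    x = torch.cat([x, x], dim=0)  # (4, 3)
    y = x.view(3, 4)              # (3, 4)
    y = x.view(-1, 6)             # (2, 6); the -1 dimension is inferred
    y = x.transpose(0, 1)         # (3, 4)
    y = x.permute(1, 0)           # (3, 4)
    y = x.unsqueeze(1)            # (4, 1, 3)
    y = y.squeeze()               # (4, 3); all size-1 dims removed
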

@@ -127,9 +129,9 @@ Algebra

 .. code-block:: python

-    A.mm(B)          # matrix multiplication
-    A.mv(x)          # matrix-vector multiplication
-    x.t()            # matrix transpose
+    ret = A.mm(B)    # matrix multiplication
+    ret = A.mv(x)    # matrix-vector multiplication
+    x = x.t()        # matrix transpose

 See `math
 operations <https://pytorch.org/docs/stable/torch.html?highlight=mm#math-operations>`__
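
Illustration only (not in the commit): the same calls with concrete, arbitrary shapes.

    import torch

    A = torch.randn(3, 4)
    B = torch.randn(4, 5)
    v = torch.randn(4)

    ret = A.mm(B)     # (3, 5) matrix product
    ret = A.mv(v)     # (3,) matrix-vector product
    At  = A.t()       # (4, 3) transpose
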
@@ -139,24 +141,24 @@ GPU Usage

 .. code-block:: python

-    torch.cuda.is_available                                 # check for cuda
-    x.cuda()                                                # move x's data from
-                                                            # CPU to GPU and return new object
+    torch.cuda.is_available                                 # check for cuda
+    x = x.cuda()                                            # move x's data from
+                                                            # CPU to GPU and return new object

-    x.cpu()                                                 # move x's data from GPU to CPU
-                                                            # and return new object
+    x = x.cpu()                                             # move x's data from GPU to CPU
+                                                            # and return new object

-    if not args.disable_cuda and torch.cuda.is_available(): # device agnostic code
-        args.device = torch.device('cuda')                  # and modularity
-    else:                                                   #
-        args.device = torch.device('cpu')                   #
+    if not args.disable_cuda and torch.cuda.is_available(): # device agnostic code
+        args.device = torch.device('cuda')                  # and modularity
+    else:                                                   #
+        args.device = torch.device('cpu')                   #

-    net.to(device)                                          # recursively convert their
-                                                            # parameters and buffers to
-                                                            # device specific tensors
+    net.to(device)                                          # recursively convert their
+                                                            # parameters and buffers to
+                                                            # device specific tensors

-    mytensor.to(device)                                     # copy your tensors to a device
-                                                            # (gpu, cpu)
+    x = x.to(device)                                        # copy your tensors to a device
+                                                            # (gpu, cpu)

 See `cuda <https://pytorch.org/docs/stable/cuda.html>`__
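
A device-agnostic sketch (editor's addition, not part of the commit) combining the entries above; the Linear layer and batch size are placeholders.

    import torch
    import torch.nn as nn

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    net = nn.Linear(4, 2).to(device)   # modules are moved in place
    x = torch.randn(8, 4)
    x = x.to(device)                   # tensors are copied, so reassign the result
    out = net(x)
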

@@ -175,7 +177,7 @@ Deep Learning
     nn.MaxPoolXd(s)                           # X dimension pooling layer
                                               # (notation as above)

-    nn.BatchNorm                              # batch norm layer
+    nn.BatchNormXd                            # batch norm layer
     nn.RNN/LSTM/GRU                           # recurrent layers
     nn.Dropout(p=0.5, inplace=False)          # dropout layer for any dimensional input
     nn.Dropout2d(p=0.5, inplace=False)        # 2-dimensional channel-wise dropout
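
For context, a small illustrative stack (not from the commit) using the corrected nn.BatchNormXd naming; channel counts and input size are arbitrary.

    import torch
    import torch.nn as nn

    model = nn.Sequential(
        nn.Conv2d(3, 16, kernel_size=3, padding=1),
        nn.BatchNorm2d(16),              # the Xd suffix matches the input dimensionality
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Dropout2d(p=0.5),
    )
    out = model(torch.randn(1, 3, 32, 32))   # -> (1, 16, 16, 16)
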
@@ -189,11 +191,15 @@ Loss Functions

 .. code-block:: python

-    nn.X                                  # where X is BCELoss, CrossEntropyLoss,
-                                          # L1Loss, MSELoss, NLLLoss, SoftMarginLoss,
-                                          # MultiLabelSoftMarginLoss, CosineEmbeddingLoss,
-                                          # KLDivLoss, MarginRankingLoss, HingeEmbeddingLoss
-                                          # or CosineEmbeddingLoss
+    nn.X                                  # where X is L1Loss, MSELoss, CrossEntropyLoss
+                                          # CTCLoss, NLLLoss, PoissonNLLLoss,
+                                          # KLDivLoss, BCELoss, BCEWithLogitsLoss,
+                                          # MarginRankingLoss, HingeEmbeddingLoss,
+                                          # MultiLabelMarginLoss, SmoothL1Loss,
+                                          # SoftMarginLoss, MultiLabelSoftMarginLoss,
+                                          # CosineEmbeddingLoss, MultiMarginLoss,
+                                          # or TripletMarginLoss
+

 See `loss
 functions <https://pytorch.org/docs/stable/nn.html#loss-functions>`__
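
One of the listed losses in use, as an editor's sketch with made-up logits and labels.

    import torch
    import torch.nn as nn

    criterion = nn.CrossEntropyLoss()       # one of the losses listed above
    logits = torch.randn(8, 10)             # unnormalized scores for 10 classes
    target = torch.randint(0, 10, (8,))     # integer class labels
    loss = criterion(logits, target)
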
@@ -204,10 +210,10 @@ Activation Functions
 .. code-block:: python

     nn.X                                  # where X is ReLU, ReLU6, ELU, SELU, PReLU, LeakyReLU,
-                                          # Threshold, HardTanh, Sigmoid, Tanh,
-                                          # LogSigmoid, Softplus, SoftShrink,
-                                          # Softsign, TanhShrink, Softmin, Softmax,
-                                          # Softmax2d or LogSoftmax
+                                          # RReLu, CELU, GELU, Threshold, Hardshrink, HardTanh,
+                                          # Sigmoid, LogSigmoid, Softplus, SoftShrink,
+                                          # Softsign, Tanh, TanhShrink, Softmin, Softmax,
+                                          # Softmax2d, LogSoftmax or AdaptiveSoftmaxWithLoss

 See `activation
 functions <https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity>`__
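
A short sketch (not part of the commit) using one of the newly listed activations; it assumes a PyTorch release that ships nn.GELU (1.4+).

    import torch
    import torch.nn as nn

    x = torch.randn(4, 5)
    act = nn.GELU()        # module form
    y = act(x)
    y = torch.sigmoid(x)   # many activations also have a functional/tensor form
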
@@ -220,8 +226,8 @@ Optimizers
     opt = optim.x(model.parameters(), ...)      # create optimizer
     opt.step()                                  # update weights
     optim.X                                     # where X is SGD, Adadelta, Adagrad, Adam,
-                                                # SparseAdam, Adamax, ASGD,
-                                                # LBFGS, RMSProp or Rprop
+                                                # AdamW, SparseAdam, Adamax, ASGD,
+                                                # LBFGS, RMSprop or Rprop

 See `optimizers <https://pytorch.org/docs/stable/optim.html>`__
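
The newly listed AdamW in a minimal step, as an illustration with arbitrary hyperparameters and a dummy loss.

    import torch
    import torch.nn as nn
    import torch.optim as optim

    model = nn.Linear(4, 2)
    opt = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)

    loss = model(torch.randn(8, 4)).pow(2).mean()   # dummy loss for illustration
    opt.zero_grad()
    loss.backward()
    opt.step()                                      # update weights
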

@@ -232,8 +238,10 @@ Learning rate scheduling

     scheduler = optim.X(optimizer,...)      # create lr scheduler
     scheduler.step()                        # update lr at start of epoch
-    optim.lr_scheduler.X                    # where X is LambdaLR, StepLR, MultiStepLR,
-                                            # ExponentialLR or ReduceLROnPLateau
+    optim.lr_scheduler.X                    # where X is LambdaLR, MultiplicativeLR,
+                                            # StepLR, MultiStepLR, ExponentialLR,
+                                            # CosineAnnealingLR, ReduceLROnPlateau, CyclicLR,
+                                            # OneCycleLR, CosineAnnealingWarmRestarts,

 See `learning rate
 scheduler <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`__
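
One of the listed schedulers in a sketch loop (not from the commit); the step size, gamma, and epoch count are arbitrary.

    import torch.nn as nn
    import torch.optim as optim

    model = nn.Linear(4, 2)
    opt = optim.SGD(model.parameters(), lr=0.1)
    scheduler = optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.5)

    for epoch in range(30):
        # ... forward pass, loss.backward(), opt.step() ...
        scheduler.step()        # multiplies the lr by 0.5 every 10 epochs
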
@@ -264,8 +272,8 @@ Dataloaders and DataSamplers
     sampler.Sampler(dataset,...)        # abstract class dealing with
                                         # ways to sample from dataset

-    sampler.XSampler where ...          # Sequential, Random, Subset,
-                                        # WeightedRandom or Distributed
+    sampler.XSampler where ...          # Sequential, Random, SubsetRandom,
+                                        # WeightedRandom, Batch, Distributed

 See
 `dataloader <https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader>`__
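
A small sketch (editor's illustration) wiring one of the listed samplers into a DataLoader; the dataset and weights are synthetic.

    import torch
    from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

    dataset = TensorDataset(torch.randn(100, 4), torch.randint(0, 2, (100,)))
    weights = torch.ones(100)                        # per-sample draw weights
    sampler = WeightedRandomSampler(weights, num_samples=100, replacement=True)
    loader = DataLoader(dataset, batch_size=16, sampler=sampler)

    for xb, yb in loader:
        pass                                         # iterate over resampled batches
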
