@@ -89,7 +89,6 @@ def forward(self, x):
 class Encoder(nn.Module):
     def __init__(self, ntoken, ninp, dropout=0.5):
         super(Encoder, self).__init__()
-        self.src_mask = None
         self.pos_encoder = PositionalEncoding(ninp, dropout)
         self.encoder = nn.Embedding(ntoken, ninp)
         self.ninp = ninp
@@ -99,17 +98,9 @@ def init_weights(self):
         initrange = 0.1
         self.encoder.weight.data.uniform_(-initrange, initrange)
 
-    def _generate_square_subsequent_mask(self, sz):
-        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-        return mask
-
     def forward(self, src):
-        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
-            device = src.device
-            mask = self._generate_square_subsequent_mask(src.size(0)).to(device)
-            self.src_mask = mask
-
+        # Need (S, N) format for encoder.
+        src = src.t()
         src = self.encoder(src) * math.sqrt(self.ninp)
         return self.pos_encoder(src)
 
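`Pipe` splits each mini-batch into micro-batches along dimension 0, so the pipeline stages receive batch-first `(N, S)` input, while the transformer layers in PyTorch 1.8 expect sequence-first `(S, N, E)` tensors; hence the `src.t()` at the top of the first stage. A minimal shape sketch of that transpose (the sizes below are illustrative assumptions, not the tutorial's values):

```python
import math
import torch
import torch.nn as nn

# Illustrative sizes only (assumed for this sketch).
ntoken, ninp, bptt, batch = 1000, 16, 35, 20

embed = nn.Embedding(ntoken, ninp)

src = torch.randint(0, ntoken, (batch, bptt))  # (N, S): batch-first, as Pipe delivers it
src = src.t()                                  # (S, N): sequence-first for the encoder layers
out = embed(src) * math.sqrt(ninp)             # (S, N, E)
print(out.shape)                               # torch.Size([35, 20, 16])
```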
@@ -125,7 +116,8 @@ def init_weights(self):
         self.decoder.weight.data.uniform_(-initrange, initrange)
 
     def forward(self, inp):
-        return self.decoder(inp)
+        # Need batch dimension first for output of pipeline.
+        return self.decoder(inp).permute(1, 0, 2)
 
 ######################################################################
 # Start multiple processes for training
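On the way out, the decoder stage mirrors that transpose: `Pipe` stitches the micro-batch outputs back together along dimension 0, so the last stage must return batch-first tensors. A small sketch of what `permute(1, 0, 2)` does here (sizes are again assumptions):

```python
import torch

# Illustrative sizes only (assumed for this sketch).
S, N, ntokens = 35, 20, 1000

dec_out = torch.randn(S, N, ntokens)    # (S, N, ntokens): sequence-first decoder output
batch_first = dec_out.permute(1, 0, 2)  # (N, S, ntokens): batch dimension first
print(batch_first.shape)                # torch.Size([20, 35, 1000])
```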
@@ -245,7 +237,8 @@ def get_batch(source, i):
     seq_len = min(bptt, len(source) - 1 - i)
     data = source[i:i + seq_len]
     target = source[i + 1:i + 1 + seq_len].view(-1)
-    return data, target
+    # Need batch dimension first for pipeline parallelism.
+    return data.t(), target
 
 ######################################################################
 # Model scale and Pipe initialization
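The transpose in `get_batch` is what makes the micro-batching work: `Pipe` chunks its input along dimension 0, so that dimension has to be the batch. A rough stand-in for the split `Pipe` performs internally (`chunks` and the sizes are assumptions for illustration):

```python
import torch

# Illustrative sizes only (assumed for this sketch).
seq_len, batch, chunks = 35, 20, 4

source = torch.randint(0, 1000, (seq_len, batch))  # (S, N), as the corpus is laid out
data = source.t()                                  # (N, S), as get_batch now returns it

# Roughly what Pipe does internally: split along dim 0 into micro-batches.
micro_batches = data.chunk(chunks, dim=0)
print([tuple(mb.shape) for mb in micro_batches])   # [(5, 35), (5, 35), (5, 35), (5, 35)]
```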
@@ -318,8 +311,9 @@ def get_batch(source, i):
 # Need to use 'checkpoint=never' since as of PyTorch 1.8, Pipe checkpointing
 # doesn't work with DDP.
 from torch.distributed.pipeline.sync import Pipe
+chunks = 8
 model = Pipe(torch.nn.Sequential(
-    *module_list), chunks=8, checkpoint="never")
+    *module_list), chunks=chunks, checkpoint="never")
 
 # Initialize process group and wrap model in DDP.
 from torch.nn.parallel import DistributedDataParallel
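Hoisting `chunks` into a variable just names the micro-batch count: each mini-batch of size `N` is run as `chunks` micro-batches of size `N / chunks`. A rough single-process sketch of the `Pipe` setup, under the assumption that PyTorch 1.8's `Pipe` accepts CPU partitions and requires RPC to be initialized first, as the tutorial does earlier (addresses, names, and layer sizes here are made up):

```python
import os
import torch
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe

# Pipe relies on the RPC framework even in a single process.
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
rpc.init_rpc("worker", rank=0, world_size=1)

# Two trivial stages stand in for the tutorial's module_list partitions.
chunks = 8
model = Pipe(torch.nn.Sequential(torch.nn.Linear(16, 16),
                                 torch.nn.Linear(16, 16)),
             chunks=chunks, checkpoint="never")

x = torch.randn(32, 16)       # mini-batch of 32 -> micro-batches of 4 along dim 0
out = model(x).local_value()  # forward returns an RRef in PyTorch 1.8
print(out.shape)              # torch.Size([32, 16])

rpc.shutdown()
```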