
Commit 220e447

Merge branch 'master' into patch-1
2 parents 9bc959a + be9d2e6

File tree

5 files changed: +18 -20 lines changed

5 files changed

+18
-20
lines changed

beginner_source/Intro_to_TorchScript_tutorial.py

Lines changed: 6 additions & 2 deletions
@@ -274,6 +274,8 @@ def forward(self, x, h):
 
 my_cell = MyCell(MyDecisionGate())
 traced_cell = torch.jit.trace(my_cell, (x, h))
+
+print(traced_cell.dg.code)
 print(traced_cell.code)
 
 
@@ -293,8 +295,10 @@ def forward(self, x, h):
 scripted_gate = torch.jit.script(MyDecisionGate())
 
 my_cell = MyCell(scripted_gate)
-traced_cell = torch.jit.script(my_cell)
-print(traced_cell.code)
+scripted_cell = torch.jit.script(my_cell)
+
+print(scripted_gate.code)
+print(scripted_cell.code)
 
 
 ######################################################################
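
For context (an illustrative sketch, not part of this commit), the distinction the updated tutorial output makes visible: torch.jit.trace records only the operations executed for the example input, so a data-dependent branch in a module like MyDecisionGate is baked in, while torch.jit.script compiles the Python control flow itself. MyDecisionGate is re-declared locally here to keep the snippet self-contained.

import torch

class MyDecisionGate(torch.nn.Module):
    def forward(self, x):
        if x.sum() > 0:
            return x
        else:
            return -x

x = torch.rand(3, 4)
traced = torch.jit.trace(MyDecisionGate(), x)   # the if/else taken for this input is frozen into the trace
scripted = torch.jit.script(MyDecisionGate())   # the if/else is preserved as control flow
print(traced.code)
print(scripted.code)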

beginner_source/transformer_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -281,7 +281,7 @@ def train():
             print('| epoch {:3d} | {:5d}/{:5d} batches | '
                   'lr {:02.2f} | ms/batch {:5.2f} | '
                   'loss {:5.2f} | ppl {:8.2f}'.format(
-                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
+                    epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                     elapsed * 1000 / log_interval,
                     cur_loss, math.exp(cur_loss)))
             total_loss = 0
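
As background (a minimal sketch, not taken from the diff), get_last_lr() reports the learning rate most recently applied by scheduler.step(), whereas calling get_lr() directly warns on recent PyTorch releases. StepLR is used below for illustration; any torch.optim scheduler behaves the same way.

import torch

model = torch.nn.Linear(10, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=5.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

optimizer.step()
scheduler.step()
print(scheduler.get_last_lr()[0])  # 4.75, the rate now set on the optimizer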

intermediate_source/dynamic_quantization_bert_tutorial.rst

Lines changed: 2 additions & 2 deletions
@@ -494,7 +494,7 @@ follows:
 | FP32 | 0.9019 | 438 MB | 160 sec | 85 sec |
 | INT8 | 0.902 | 181 MB | 90 sec | 46 sec |
 
-We have 0.6% F1 score accuracy after applying the post-training dynamic
+We have 0.6% lower F1 score accuracy after applying the post-training dynamic
 quantization on the fine-tuned BERT model on the MRPC task. As a
 comparison, in a `recent paper <https://arxiv.org/pdf/1910.06188.pdf>`_ (Table 1),
 it achieved 0.8788 by
@@ -541,7 +541,7 @@ To load the quantized model, we can use `torch.jit.load`
 Conclusion
 ----------
 
-In this tutorial, we demonstrated how to demonstrate how to convert a
+In this tutorial, we demonstrated how to convert a
 well-known state-of-the-art NLP model like BERT into dynamic quantized
 model. Dynamic quantization can reduce the size of the model while only
 having a limited implication on accuracy.
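
For reference (an illustrative sketch, not from the commit), post-training dynamic quantization of the kind this tutorial describes is a single call; it is applied here to a toy stack of Linear layers rather than the fine-tuned BERT model.

import torch

model = torch.nn.Sequential(
    torch.nn.Linear(128, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 2),
)
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8  # quantize weights of Linear layers to int8
)
print(quantized_model)  # Linear modules are replaced by DynamicQuantizedLinear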

intermediate_source/pipeline_tutorial.py

Lines changed: 8 additions & 14 deletions
@@ -56,7 +56,6 @@
 class Encoder(nn.Module):
     def __init__(self, ntoken, ninp, dropout=0.5):
         super(Encoder, self).__init__()
-        self.src_mask = None
         self.pos_encoder = PositionalEncoding(ninp, dropout)
         self.encoder = nn.Embedding(ntoken, ninp)
         self.ninp = ninp
@@ -66,17 +65,9 @@ def init_weights(self):
         initrange = 0.1
         self.encoder.weight.data.uniform_(-initrange, initrange)
 
-    def _generate_square_subsequent_mask(self, sz):
-        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-        return mask
-
     def forward(self, src):
-        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
-            device = src.device
-            mask = self._generate_square_subsequent_mask(src.size(0)).to(device)
-            self.src_mask = mask
-
+        # Need (S, N) format for encoder.
+        src = src.t()
         src = self.encoder(src) * math.sqrt(self.ninp)
         return self.pos_encoder(src)
 
@@ -92,7 +83,8 @@ def init_weights(self):
         self.decoder.weight.data.uniform_(-initrange, initrange)
 
     def forward(self, inp):
-        return self.decoder(inp)
+        # Need batch dimension first for output of pipeline.
+        return self.decoder(inp).permute(1, 0, 2)
 
 
 ######################################################################
@@ -221,7 +213,8 @@ def get_batch(source, i):
     seq_len = min(bptt, len(source) - 1 - i)
     data = source[i:i+seq_len]
     target = source[i+1:i+1+seq_len].view(-1)
-    return data, target
+    # Need batch dimension first for pipeline parallelism.
+    return data.t(), target
 
 ######################################################################
 # Model scale and Pipe initialization
@@ -297,7 +290,8 @@ def get_batch(source, i):
 from torch.distributed.pipeline.sync import Pipe
 
 # Build the pipeline.
-model = Pipe(torch.nn.Sequential(*module_list), chunks = 8)
+chunks = 8
+model = Pipe(torch.nn.Sequential(*module_list), chunks = chunks)
 
 
 def get_total_params(module: torch.nn.Module):
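
Why the batch dimension has to come first (a minimal single-process, CPU-only sketch under that assumption, not taken from the tutorial): Pipe splits the first dimension of its input into `chunks` micro-batches, and it needs the RPC framework initialized even with a single worker. The two Linear stages below stand in for the tutorial's transformer partitions.

import os
import torch
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe

os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'
rpc.init_rpc('worker', rank=0, world_size=1)  # Pipe relies on RPC internally

stage1 = torch.nn.Linear(16, 32)
stage2 = torch.nn.Linear(32, 4)
chunks = 8
model = Pipe(torch.nn.Sequential(stage1, stage2), chunks=chunks)

batch = torch.randn(64, 16)           # batch dimension first; split into 8 micro-batches
output = model(batch).local_value()   # forward returns an RRef to the output
print(output.shape)                   # torch.Size([64, 4])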

intermediate_source/torchvision_tutorial.rst

Lines changed: 1 addition & 1 deletion
@@ -263,7 +263,7 @@ way of doing it:
 # be [0]. More generally, the backbone should return an
 # OrderedDict[Tensor], and in featmap_names you can choose which
 # feature maps to use.
-roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
+roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                 output_size=7,
                                                 sampling_ratio=2)
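
An illustrative sketch (not part of the commit) of why the string key matters: MultiScaleRoIAlign looks up feature maps by the OrderedDict keys the backbone returns, and those keys are strings such as '0'.

from collections import OrderedDict
import torch
import torchvision

roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                output_size=7,
                                                sampling_ratio=2)

features = OrderedDict([('0', torch.randn(1, 8, 64, 64))])   # backbone output keyed by '0'
boxes = [torch.tensor([[0.0, 0.0, 32.0, 32.0]])]             # one box for one image
image_sizes = [(128, 128)]                                    # original image sizes
pooled = roi_pooler(features, boxes, image_sizes)
print(pooled.shape)  # torch.Size([1, 8, 7, 7])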
