
Commit 220e447

Merge branch 'master' into patch-1
2 parents 9bc959a + be9d2e6

File tree

5 files changed: +18 -20 lines changed

5 files changed

+18
-20
lines changed

beginner_source/Intro_to_TorchScript_tutorial.py

Lines changed: 6 additions & 2 deletions
@@ -274,6 +274,8 @@ def forward(self, x, h):
 
 my_cell = MyCell(MyDecisionGate())
 traced_cell = torch.jit.trace(my_cell, (x, h))
+
+print(traced_cell.dg.code)
 print(traced_cell.code)
 
 
@@ -293,8 +295,10 @@ def forward(self, x, h):
 scripted_gate = torch.jit.script(MyDecisionGate())
 
 my_cell = MyCell(scripted_gate)
-traced_cell = torch.jit.script(my_cell)
-print(traced_cell.code)
+scripted_cell = torch.jit.script(my_cell)
+
+print(scripted_gate.code)
+print(scripted_cell.code)
 
 
 ######################################################################
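
For context (an illustrative sketch, not part of this commit), the distinction the updated tutorial output makes visible: torch.jit.trace records only the operations executed for the example input, so a data-dependent branch in a module like MyDecisionGate is baked in, while torch.jit.script compiles the Python control flow itself. MyDecisionGate is re-declared locally here to keep the snippet self-contained.

import torch

class MyDecisionGate(torch.nn.Module):
    def forward(self, x):
        if x.sum() > 0:
            return x
        else:
            return -x

x = torch.rand(3, 4)
traced = torch.jit.trace(MyDecisionGate(), x)   # the if/else taken for this input is frozen into the trace
scripted = torch.jit.script(MyDecisionGate())   # the if/else is preserved as control flow
print(traced.code)
print(scripted.code)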

beginner_source/transformer_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -281,7 +281,7 @@ def train():
             print('| epoch {:3d} | {:5d}/{:5d} batches | '
                   'lr {:02.2f} | ms/batch {:5.2f} | '
                   'loss {:5.2f} | ppl {:8.2f}'.format(
-                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
+                    epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                     elapsed * 1000 / log_interval,
                     cur_loss, math.exp(cur_loss)))
             total_loss = 0
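
As background (a minimal sketch, not taken from the diff), get_last_lr() reports the learning rate most recently applied by scheduler.step(), whereas calling get_lr() directly warns on recent PyTorch releases. StepLR is used below for illustration; any torch.optim scheduler behaves the same way.

import torch

model = torch.nn.Linear(10, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=5.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

optimizer.step()
scheduler.step()
print(scheduler.get_last_lr()[0])  # 4.75, the rate now set on the optimizer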

intermediate_source/dynamic_quantization_bert_tutorial.rst

Lines changed: 2 additions & 2 deletions
@@ -494,7 +494,7 @@ follows:
 | FP32 | 0.9019 | 438 MB | 160 sec | 85 sec |
 | INT8 | 0.902 | 181 MB | 90 sec | 46 sec |
 
-We have 0.6% F1 score accuracy after applying the post-training dynamic
+We have 0.6% lower F1 score accuracy after applying the post-training dynamic
 quantization on the fine-tuned BERT model on the MRPC task. As a
 comparison, in a `recent paper <https://arxiv.org/pdf/1910.06188.pdf>`_ (Table 1),
 it achieved 0.8788 by
@@ -541,7 +541,7 @@ To load the quantized model, we can use `torch.jit.load`
 Conclusion
 ----------
 
-In this tutorial, we demonstrated how to demonstrate how to convert a
+In this tutorial, we demonstrated how to convert a
 well-known state-of-the-art NLP model like BERT into dynamic quantized
 model. Dynamic quantization can reduce the size of the model while only
 having a limited implication on accuracy.
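
For reference (an illustrative sketch, not from the commit), post-training dynamic quantization of the kind this tutorial describes is a single call; it is applied here to a toy stack of Linear layers rather than the fine-tuned BERT model.

import torch

model = torch.nn.Sequential(
    torch.nn.Linear(128, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 2),
)
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8  # quantize weights of Linear layers to int8
)
print(quantized_model)  # Linear modules are replaced by DynamicQuantizedLinear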

intermediate_source/pipeline_tutorial.py

Lines changed: 8 additions & 14 deletions
@@ -56,7 +56,6 @@
 class Encoder(nn.Module):
     def __init__(self, ntoken, ninp, dropout=0.5):
         super(Encoder, self).__init__()
-        self.src_mask = None
         self.pos_encoder = PositionalEncoding(ninp, dropout)
         self.encoder = nn.Embedding(ntoken, ninp)
         self.ninp = ninp
@@ -66,17 +65,9 @@ def init_weights(self):
         initrange = 0.1
         self.encoder.weight.data.uniform_(-initrange, initrange)
 
-    def _generate_square_subsequent_mask(self, sz):
-        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
-        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
-        return mask
-
     def forward(self, src):
-        if self.src_mask is None or self.src_mask.size(0) != src.size(0):
-            device = src.device
-            mask = self._generate_square_subsequent_mask(src.size(0)).to(device)
-            self.src_mask = mask
-
+        # Need (S, N) format for encoder.
+        src = src.t()
         src = self.encoder(src) * math.sqrt(self.ninp)
         return self.pos_encoder(src)
 
@@ -92,7 +83,8 @@ def init_weights(self):
         self.decoder.weight.data.uniform_(-initrange, initrange)
 
     def forward(self, inp):
-        return self.decoder(inp)
+        # Need batch dimension first for output of pipeline.
+        return self.decoder(inp).permute(1, 0, 2)
 
 
 ######################################################################
@@ -221,7 +213,8 @@ def get_batch(source, i):
     seq_len = min(bptt, len(source) - 1 - i)
     data = source[i:i+seq_len]
     target = source[i+1:i+1+seq_len].view(-1)
-    return data, target
+    # Need batch dimension first for pipeline parallelism.
+    return data.t(), target
 
 ######################################################################
 # Model scale and Pipe initialization
@@ -297,7 +290,8 @@ def get_batch(source, i):
 from torch.distributed.pipeline.sync import Pipe
 
 # Build the pipeline.
-model = Pipe(torch.nn.Sequential(*module_list), chunks = 8)
+chunks = 8
+model = Pipe(torch.nn.Sequential(*module_list), chunks = chunks)
 
 
 def get_total_params(module: torch.nn.Module):
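
Why the batch dimension has to come first (a minimal single-process, CPU-only sketch under that assumption, not taken from the tutorial): Pipe splits the first dimension of its input into `chunks` micro-batches, and it needs the RPC framework initialized even with a single worker. The two Linear stages below stand in for the tutorial's transformer partitions.

import os
import torch
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe

os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'
rpc.init_rpc('worker', rank=0, world_size=1)  # Pipe relies on RPC internally

stage1 = torch.nn.Linear(16, 32)
stage2 = torch.nn.Linear(32, 4)
chunks = 8
model = Pipe(torch.nn.Sequential(stage1, stage2), chunks=chunks)

batch = torch.randn(64, 16)           # batch dimension first; split into 8 micro-batches
output = model(batch).local_value()   # forward returns an RRef to the output
print(output.shape)                   # torch.Size([64, 4])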

intermediate_source/torchvision_tutorial.rst

Lines changed: 1 addition & 1 deletion
@@ -263,7 +263,7 @@ way of doing it:
 # be [0]. More generally, the backbone should return an
 # OrderedDict[Tensor], and in featmap_names you can choose which
 # feature maps to use.
-roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
+roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                 output_size=7,
                                                 sampling_ratio=2)
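
An illustrative sketch (not part of the commit) of why the string key matters: MultiScaleRoIAlign looks up feature maps by the OrderedDict keys the backbone returns, and those keys are strings such as '0'.

from collections import OrderedDict
import torch
import torchvision

roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                output_size=7,
                                                sampling_ratio=2)

features = OrderedDict([('0', torch.randn(1, 8, 64, 64))])   # backbone output keyed by '0'
boxes = [torch.tensor([[0.0, 0.0, 32.0, 32.0]])]             # one box for one image
image_sizes = [(128, 128)]                                    # original image sizes
pooled = roi_pooler(features, boxes, image_sizes)
print(pooled.shape)  # torch.Size([1, 8, 7, 7])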
