@@ -166,7 +166,7 @@ def make_model(in_size, out_size, num_layers):
# Updates the scale for next iteration.
scaler.update()

- opt.zero_grad()
+ opt.zero_grad() # set_to_none=True here can modestly improve performance

##########################################################
# All together: "Automatic Mixed Precision"
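Note on the change above: ``zero_grad`` takes a ``set_to_none`` flag, and ``set_to_none=True`` resets gradients to ``None`` instead of zero-filling the tensors, which can modestly improve performance. A minimal sketch, with a hypothetical model and optimizer standing in for the tutorial's::

    import torch

    net = torch.nn.Linear(8, 8)                      # stand-in model
    opt = torch.optim.SGD(net.parameters(), lr=0.1)  # stand-in optimizer

    loss = net(torch.randn(4, 8)).sum()
    loss.backward()
    opt.step()
    # Grads become None rather than zeroed tensors; the next backward()
    # allocates fresh .grad tensors instead of accumulating into old ones.
    opt.zero_grad(set_to_none=True)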
@@ -190,7 +190,7 @@ def make_model(in_size, out_size, num_layers):
scaler.scale(loss).backward()
scaler.step(opt)
scaler.update()
- opt.zero_grad()
+ opt.zero_grad() # set_to_none=True here can modestly improve performance

end_timer_and_print("Mixed precision:")

##########################################################
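For context, the loop these lines come from pairs ``torch.cuda.amp.autocast`` (mixed-precision forward pass) with ``torch.cuda.amp.GradScaler`` (loss scaling). A self-contained sketch; the sizes, ``net``, ``opt``, ``loss_fn``, and data are illustrative stand-ins, not taken from this commit::

    import torch

    net = torch.nn.Linear(512, 512).cuda()
    opt = torch.optim.SGD(net.parameters(), lr=0.001)
    loss_fn = torch.nn.MSELoss().cuda()
    scaler = torch.cuda.amp.GradScaler()
    data = [torch.randn(512, 512, device="cuda") for _ in range(4)]
    targets = [torch.randn(512, 512, device="cuda") for _ in range(4)]

    for input, target in zip(data, targets):
        with torch.cuda.amp.autocast():  # eligible ops run in float16
            output = net(input)
            loss = loss_fn(output, target)
        scaler.scale(loss).backward()    # backward on the scaled loss
        scaler.step(opt)                 # unscales grads; skips step on infs/NaNs
        scaler.update()                  # adjusts the scale for the next iteration
        opt.zero_grad() # set_to_none=True here can modestly improve performance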
@@ -216,7 +216,7 @@ def make_model(in_size, out_size, num_layers):
scaler.step(opt)
scaler.update()
- opt.zero_grad()
+ opt.zero_grad() # set_to_none=True here can modestly improve performance

##########################################################
# Saving/Resuming
@@ -232,14 +232,16 @@ def make_model(in_size, out_size, num_layers):
checkpoint = {"model": net.state_dict(),
              "optimizer": opt.state_dict(),
              "scaler": scaler.state_dict()}
+ # Write checkpoint as desired, e.g.,
+ # torch.save(checkpoint, "filename")

##########################################################
- # (write checkpoint as desired, e.g., ``torch.save(checkpoint, "filename")``.)
- #
# When resuming, load the scaler state dict alongside the model and optimizer state dicts.
- # (read checkpoint as desired, e.g.,
- # ``checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(torch.cuda.current_device()))``)
+ # Read checkpoint as desired, e.g.,
+ # dev = torch.cuda.current_device()
+ # checkpoint = torch.load("filename",
+ #                         map_location = lambda storage, loc: storage.cuda(dev))
net.load_state_dict(checkpoint["model"])
opt.load_state_dict(checkpoint["optimizer"])
scaler.load_state_dict(checkpoint["scaler"])
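End to end, the save/resume flow this hunk documents looks like the sketch below; ``"filename"`` is a placeholder path, and the model and optimizer are stand-ins. ``GradScaler``'s state dict round-trips like any other state dict::

    import torch

    net = torch.nn.Linear(8, 8).cuda()
    opt = torch.optim.SGD(net.parameters(), lr=0.1)
    scaler = torch.cuda.amp.GradScaler()

    # Saving: capture all three state dicts in one checkpoint.
    checkpoint = {"model": net.state_dict(),
                  "optimizer": opt.state_dict(),
                  "scaler": scaler.state_dict()}
    torch.save(checkpoint, "filename")

    # Resuming: map storages onto the current device, then restore all three.
    dev = torch.cuda.current_device()
    checkpoint = torch.load("filename",
                            map_location=lambda storage, loc: storage.cuda(dev))
    net.load_state_dict(checkpoint["model"])
    opt.load_state_dict(checkpoint["optimizer"])
    scaler.load_state_dict(checkpoint["scaler"])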
@@ -294,7 +296,7 @@ def make_model(in_size, out_size, num_layers):
# 2. Your network may be GPU compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores.
#    In this case a reduced speedup is expected.
# 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8.
- #    (For NLP models with encoders/decoders, this can be subtle. Also. convolutions used to have similar size constraints
+ #    (For NLP models with encoders/decoders, this can be subtle. Also, convolutions used to have similar size constraints
#    for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See
#    `here <https://github.com/NVIDIA/apex/issues/221#issuecomment-478084841>`_ for guidance.)
#
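For point 3, a common remedy is to pad the offending dimension (e.g., a vocabulary or hidden size) up to the next multiple of 8. A tiny hypothetical helper, not part of the tutorial::

    def round_up_to_multiple_of_8(n):
        # Keeps matmul dimensions Tensor Core-friendly.
        return ((n + 7) // 8) * 8

    print(round_up_to_multiple_of_8(30522))  # 30528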