where some operations use the ``torch.float32`` (``float``) datatype and other operations
use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions,
are much faster in ``float16``. Other ops, like reductions, often require the dynamic
range of ``float32``. Mixed precision tries to match each op to its appropriate datatype,
which can reduce your network's runtime and memory footprint.

Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.autocast>`_ and
`torch.cuda.amp.GradScaler <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler>`_ together.
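For a quick sense of what ``autocast`` alone does, here is a minimal sketch (``a`` and
``b`` are arbitrary CUDA tensors; any ``float32`` matmul behaves the same way)::

    import torch

    a = torch.randn(8, 8, device="cuda")   # torch.float32 by default
    b = torch.randn(8, 8, device="cuda")   # torch.float32 by default
    with torch.cuda.amp.autocast():
        c = a @ b                           # matmuls run in float16 under autocast
    print(c.dtype)                          # torch.float16
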
This recipe measures the performance of a simple network in default precision,
then walks through adding ``autocast`` and ``GradScaler`` to run the same network in
mixed precision with improved performance.

You may download and run this recipe as a standalone Python script.
The only requirements are PyTorch 1.6+ and a CUDA-capable GPU.

Mixed precision primarily benefits Tensor Core-enabled architectures (Volta, Turing, Ampere).
##########################################################
# ``batch_size``, ``in_size``, ``out_size``, and ``num_layers`` are chosen to be large enough to saturate the GPU with work.
# Typically, mixed precision provides the greatest speedup when the GPU is saturated.
# Small networks may be CPU bound, in which case mixed precision won't improve performance.
# Sizes are also chosen such that linear layers' participating dimensions are multiples of 8,
# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting<troubleshooting>` below).
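# For illustration, values like the following satisfy those guidelines (hypothetical
# placeholders, not necessarily the numbers used for the recipe's own measurements):

batch_size = 512   # any sufficiently large multiple of 8
in_size = 4096
out_size = 4096
num_layers = 3
num_batches = 50
epochs = 3

# Creates data in default precision (torch.float32) and an illustrative loss function.
data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)]
targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)]
loss_fn = torch.nn.MSELoss().cuda()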
##########################################################
# Default Precision
# -----------------
# Without ``torch.cuda.amp``, the following simple network executes all ops in default precision (``torch.float32``):

net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)
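
# A sketch of the default-precision training loop (the full recipe also times this loop;
# the timing helpers are omitted here). ``data``, ``targets``, and ``loss_fn`` are the
# illustrative objects defined above.
for epoch in range(epochs):
    for input, target in zip(data, targets):
        output = net(input)
        loss = loss_fn(output, target)
        loss.backward()
        opt.step()
        opt.zero_grad()  # set_to_none=True here can modestly improve performance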
# helps prevent gradients with small magnitudes from flushing to zero
# ("underflowing") when training with mixed precision.
#
# `torch.cuda.amp.GradScaler <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler>`_
# performs the steps of gradient scaling conveniently.

# Constructs scaler once, at the beginning of the convergence run, using default args.
# If your network fails to converge with default GradScaler args, please file an issue.
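scaler = torch.cuda.amp.GradScaler()

# The loop below is a sketch of gradient scaling in practice, reusing the illustrative
# ``data``, ``targets``, and ``loss_fn`` from above.
for epoch in range(0):  # 0 epochs, this section is for illustration only
    for input, target in zip(data, targets):
        with torch.cuda.amp.autocast():
            output = net(input)
            loss = loss_fn(output, target)

        # Scales the loss, and calls backward() on the scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the optimizer's gradients; if they contain no infs/NaNs,
        # optimizer.step() is called, otherwise it is skipped.
        scaler.step(opt)

        # Updates the scale factor for the next iteration.
        scaler.update()

        opt.zero_grad()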
##########################################################
# All together ("Automatic Mixed Precision")
# ------------------------------------------
# (The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``.
# If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops.
# This allows switching between default precision and mixed precision without if/else statements.)

use_amp = True
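
# A sketch of the full run, passing ``enabled=use_amp`` to both ``autocast`` and ``GradScaler``
# (model and optimizer construction mirror the default-precision cell above; the full recipe
# also times this loop).
net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

for epoch in range(epochs):
    for input, target in zip(data, targets):
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = net(input)
            loss = loss_fn(output, target)
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()
        opt.zero_grad()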
# Inspecting/modifying gradients (e.g., clipping)
# --------------------------------------------------------
# All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect
# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should
# unscale them first using `scaler.unscale_(optimizer) <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.unscale_>`_.

for epoch in range(0):  # 0 epochs, this section is for illustration only
    for input, target in zip(data, targets):
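        # Sketch of the loop body described above (``max_norm=0.1`` is an arbitrary choice).
        with torch.cuda.amp.autocast():
            output = net(input)
            loss = loss_fn(output, target)
        scaler.scale(loss).backward()

        # Unscales the gradients of the optimizer's assigned params in-place.
        scaler.unscale_(opt)

        # Since the gradients are now unscaled, clip them as usual.
        torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.1)

        scaler.step(opt)
        scaler.update()
        opt.zero_grad()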

##########################################################
# When saving, save the scaler state dict alongside the usual model and optimizer state dicts:

checkpoint = {"model": net.state_dict(),
              "optimizer": opt.state_dict(),
"scaler" : scaler .state_dict ()}

##########################################################
# (write checkpoint as desired, e.g., ``torch.save(checkpoint, "filename")``.)
#
# When resuming, load the scaler state dict alongside the model and optimizer state dicts.
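# A sketch of the loading side, reusing the ``"filename"`` example above (adjust
# ``map_location`` for the device you are resuming on):

checkpoint = torch.load("filename", map_location="cuda")
net.load_state_dict(checkpoint["model"])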
opt.load_state_dict(checkpoint["optimizer"])
scaler.load_state_dict(checkpoint["scaler"])

##########################################################
# If a checkpoint was created from a run *without* Amp, and you want to resume training *with* Amp,
# load model and optimizer states from the checkpoint as usual. The checkpoint won't contain a saved scaler state, so
# use a fresh instance of ``GradScaler`` (as sketched below).
#
# If a checkpoint was created from a run *with* Amp and you want to resume training *without* Amp,
# load model and optimizer states from the checkpoint as usual, and ignore the saved scaler state.
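#
# For example, resuming *with* Amp from a checkpoint saved *without* it might look roughly
# like this (a sketch; the checkpoint keys follow the example above):

net.load_state_dict(checkpoint["model"])
opt.load_state_dict(checkpoint["optimizer"])
scaler = torch.cuda.amp.GradScaler()  # fresh scaler; such a checkpoint has no scaler state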
##########################################################