@@ -128,15 +128,15 @@ def fused_gelu(x):
###############################################################################
# Enable channels_last memory format for computer vision models
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- # PyTorch 1.5 introduced support for `channels_last` memory format for
+ # PyTorch 1.5 introduced support for ``channels_last`` memory format for
# convolutional networks. This format is meant to be used in conjunction with
# `AMP <https://pytorch.org/docs/stable/amp.html>`_ to further accelerate
# convolutional neural networks with
# `Tensor Cores <https://www.nvidia.com/en-us/data-center/tensor-cores/>`_.
#
- # Support for `channels_last` is experimental, but it's expected to work for
+ # Support for ``channels_last`` is experimental, but it's expected to work for
# standard computer vision models (e.g. ResNet-50, SSD). To convert models to
- # `channels_last` format follow
+ # ``channels_last`` format follow
# `Channels Last Memory Format Tutorial <https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html>`_.
# The tutorial includes a section on
# `converting existing models <https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html#converting-existing-models>`_.
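
For context on the conversion the hunk above describes, here is a minimal sketch (not part of the diff) of moving a small convolutional model and its input to channels_last and running it under AMP; the layer sizes and batch shape are illustrative assumptions and a CUDA device is assumed to be available.

import torch
import torch.nn as nn

# toy convolutional model; sizes are illustrative assumptions
model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, padding=1),
).cuda()
model = model.to(memory_format=torch.channels_last)  # weights laid out as NHWC

x = torch.randn(8, 3, 224, 224, device="cuda")
x = x.to(memory_format=torch.channels_last)          # input laid out as NHWC

# channels_last is meant to be paired with AMP so convolutions can use Tensor Cores
with torch.cuda.amp.autocast():
    out = model(x)
print(out.is_contiguous(memory_format=torch.channels_last))  # True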
@@ -321,10 +321,10 @@ def fused_gelu(x):
# every training step, it's only required to perform all-reduce after the last
# call to backward, just before the execution of the optimizer.
#
- # `DistributedDataParallel` provides
+ # ``DistributedDataParallel`` provides
# `no_sync() <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.no_sync>`_
# context manager which disables gradient all-reduce for particular iteration.
- # `no_sync()` should applied to first ``N-1`` iterations of gradient
+ # ``no_sync()`` should be applied to the first ``N-1`` iterations of gradient
# accumulation, the last iteration should follow the default execution and
# perform the required gradient all-reduce.
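
As an illustration of the no_sync() pattern the hunk above describes, here is a rough sketch (not part of the diff); the DDP-wrapped model, optimizer, loss function, and data loader are assumed to be set up elsewhere with an initialized process group, and accumulation_steps is an illustrative value.

import contextlib

def train_with_accumulation(ddp_model, optimizer, loss_fn, data_loader,
                            accumulation_steps=4):
    # skip gradient all-reduce on the first N-1 micro-batches, sync on the last
    for step, (inputs, targets) in enumerate(data_loader):
        last_micro_batch = (step + 1) % accumulation_steps == 0
        ctx = contextlib.nullcontext() if last_micro_batch else ddp_model.no_sync()
        with ctx:
            # average the loss over the accumulated micro-batches
            loss = loss_fn(ddp_model(inputs), targets) / accumulation_steps
            loss.backward()  # all-reduce runs only when last_micro_batch is True
        if last_micro_batch:
            optimizer.step()
            optimizer.zero_grad()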
@@ -333,8 +333,8 @@ def fused_gelu(x):
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# `torch.nn.parallel.DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`_
# with ``find_unused_parameters=True`` uses the order of layers and parameters
- # from constructors to build buckets for `DistributedDataParallel` gradient
- # all-reduce. `DistributedDataParallel` overlaps all-reduce with the backward
+ # from constructors to build buckets for ``DistributedDataParallel`` gradient
+ # all-reduce. ``DistributedDataParallel`` overlaps all-reduce with the backward
# pass. All-reduce for a particular bucket is asynchronously triggered only when
# all gradients for parameters in a given bucket are available.
#
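
To make the bucketing point above concrete, a hedged sketch (not from the diff): layers are registered in __init__ in the same order they run in forward(), then the model would be wrapped in DistributedDataParallel; the layer sizes and the commented-out wrapping call are assumptions.

import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # register layers in the same order they execute in forward(), since DDP
        # with find_unused_parameters=True builds its all-reduce buckets from
        # this registration order
        self.layer1 = nn.Linear(128, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, 10)

    def forward(self, x):
        return self.layer3(self.layer2(self.layer1(x)))

# inside an initialized process group one would wrap the model, e.g.:
# ddp = nn.parallel.DistributedDataParallel(Net().cuda(), find_unused_parameters=True)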