@@ -333,16 +333,16 @@ def fused_gelu(x):
333
333
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
334
334
# `torch.nn.parallel.DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`_
335
335
# with ``find_unused_parameters=True`` uses the order of layers and parameters
336
- # from constructors to build buckets for ``DistributedDataParallel`` gradient
337
- # all-reduce. ``DistributedDataParallel`` overlaps all-reduce with the backward
338
- # pass. All-reduce for a particular bucket is asynchronously triggered only when
339
- # all gradients for parameters in a given bucket are available.
340
- #
341
- # To maximize the amount of overlap, the order in constructors should match the
342
- # order during the execution. If the order doesn't match, then all-reduce for
343
- # the entire bucket waits for the gradient which is the last to arrive, this may
344
- # reduce the overlap between backward pass and all-reduce, all-reduce may end up
345
- # being exposed, which slows down the training.
336
+ # from model constructors to build buckets for ``DistributedDataParallel``
337
+ # gradient all-reduce. ``DistributedDataParallel`` overlaps all-reduce with the
338
+ # backward pass. All-reduce for a particular bucket is asynchronously triggered
339
+ # only when all gradients for parameters in a given bucket are available.
340
+ #
341
+ # To maximize the amount of overlap, the order in model constructors should
342
+ # roughly match the order during the execution. If the order doesn't match, then
343
+ # all-reduce for the entire bucket waits for the gradient which is the last to
344
+ # arrive. This may reduce the overlap between backward pass and all-reduce,
345
+ # and all-reduce may end up being exposed, which slows down the training.
346
346
#
347
347
# ``DistributedDataParallel`` with ``find_unused_parameters=False`` (which is
348
348
# the default setting) relies on automatic bucket formation based on order of
0 commit comments