@@ -329,20 +329,26 @@ def fused_gelu(x):
# perform the required gradient all-reduce.
###############################################################################
- # Match the order of layers in constructors with order during the execution if training with DistributedDataParallel
- # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ # Match the order of layers in constructors and during the execution if using DistributedDataParallel(find_unused_parameters=True)
+ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# `torch.nn.parallel.DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`_
- # uses the order of layers and parameters from constructors to build buckets for
- # `DistributedDataParallel` gradient all-reduce. `DistributedDataParallel`
- # overlaps all-reduce with the backward pass. All -reduce for a particular bucket
- # is asynchronously triggered only when all gradients for parameters in a given
- # bucket are available.
+ # with ``find_unused_parameters=True`` uses the order of layers and parameters
+ # from constructors to build buckets for `DistributedDataParallel` gradient
+ # all-reduce. `DistributedDataParallel` overlaps all-reduce with the backward
+ # pass. All-reduce for a particular bucket is asynchronously triggered only when
+ # all gradients for parameters in a given bucket are available.
#
# To maximize the amount of overlap, the order in constructors should match the
# order during the execution. If the order doesn't match, then all-reduce for
# the entire bucket waits for the last gradient to arrive. This may reduce the
# overlap between the backward pass and all-reduce; all-reduce may end up being
# exposed, which slows down the training.
+ #
+ # ``DistributedDataParallel`` with ``find_unused_parameters=False`` (which is
+ # the default setting) relies on automatic bucket formation based on the order
+ # of operations encountered during the backward pass. With
+ # ``find_unused_parameters=False`` it's not necessary to reorder layers or
+ # parameters to achieve optimal performance.
###############################################################################
# Load-balance workload in a distributed setting
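To make the recommendation in this hunk concrete, below is a minimal sketch (the ``OrderedModel`` class, the layer sizes, and the single-process ``gloo`` setup are illustrative assumptions, not part of the tutorial): the submodules are declared in ``__init__`` in the same order they are used in ``forward``, so the buckets that ``DistributedDataParallel`` with ``find_unused_parameters=True`` derives from constructor/parameter order line up with the order in which gradients become available during the backward pass.

import os

import torch
import torch.distributed as dist
import torch.nn as nn


class OrderedModel(nn.Module):
    """Submodules are declared in the same order they are used in forward()."""

    def __init__(self):
        super().__init__()
        # declaration order below matches the execution order in forward()
        self.layer1 = nn.Linear(1024, 1024)
        self.layer2 = nn.Linear(1024, 1024)
        self.head = nn.Linear(1024, 10)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        return self.head(x)


if __name__ == "__main__":
    # Single-process gloo process group, only so that DDP can be constructed in
    # this sketch; a real job launches one process per GPU (e.g. with torchrun).
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    ddp_model = nn.parallel.DistributedDataParallel(
        OrderedModel(),
        find_unused_parameters=True,  # buckets then follow constructor order
    )

    out = ddp_model(torch.randn(16, 1024))
    out.sum().backward()  # per-bucket all-reduce overlaps with this backward pass
    dist.destroy_process_group()

With ``find_unused_parameters=False`` (the default), the same model needs no particular declaration order, since buckets are rebuilt from the order of operations observed during the backward pass.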