|
@@ -48,7 +48,7 @@
 # Disable bias for convolutions directly followed by a batch norm
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # `torch.nn.Conv2d() <https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html#torch.nn.Conv2d>`_
-# has ``bias`` parameter which defaults to ``True`` (same is true for
+# has a ``bias`` parameter which defaults to ``True`` (the same is true for
 # `Conv1d <https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html#torch.nn.Conv1d>`_
 # and
 # `Conv3d <https://pytorch.org/docs/stable/generated/torch.nn.Conv3d.html#torch.nn.Conv3d>`_
|
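To make the recommendation concrete, here is a small sketch of the pattern this hunk documents; the channel counts and surrounding layers are made up for the example, not taken from the tutorial:

    import torch
    import torch.nn as nn

    # BatchNorm2d subtracts the per-channel mean right after the convolution,
    # so a convolution bias would be cancelled out anyway; disabling it saves
    # a little memory and compute.
    block = nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),  # bias defaults to True
        nn.BatchNorm2d(64),
        nn.ReLU(inplace=True),
    )

    out = block(torch.randn(8, 3, 32, 32))  # shape: (8, 64, 32, 32)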
|
@@ -84,7 +84,7 @@
 ###############################################################################
 # The second code snippet does not zero the memory of each individual parameter,
 # also the subsequent backward pass uses assignment instead of addition to store
-# gradients, this reduces number of memory operations.
+# gradients, which reduces the number of memory operations.
 #
 # Setting gradient to ``None`` has a slightly different numerical behavior than
 # setting it to zero, for more details refer to the
|
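For reference, the two variants being compared can be sketched as follows; the tiny linear model and SGD optimizer below are placeholders rather than code from the tutorial:

    import torch
    import torch.nn as nn

    model = nn.Linear(16, 4)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    loss = model(torch.randn(2, 16)).sum()
    loss.backward()

    # Variant 1: keeps the .grad tensors and writes zeros into each of them.
    optimizer.zero_grad(set_to_none=False)

    # Variant 2: drops the .grad tensors; the next backward pass re-creates
    # them and writes gradients with assignment instead of accumulation.
    optimizer.zero_grad(set_to_none=True)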
@@ -324,7 +324,7 @@ def fused_gelu(x):
 # ``DistributedDataParallel`` provides
 # `no_sync() <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.no_sync>`_
 # context manager which disables gradient all-reduce for particular iteration.
-# ``no_sync()`` should applied to first ``N-1`` iterations of gradient
+# ``no_sync()`` should be applied to the first ``N-1`` iterations of gradient
 # accumulation, the last iteration should follow the default execution and
 # perform the required gradient all-reduce.
|
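A minimal sketch of that accumulation pattern, written as a standalone helper; the function name and its arguments (ddp_model, optimizer, criterion, loader) are illustrative and assumed to be constructed elsewhere in the training script:

    import contextlib

    def train_with_accumulation(ddp_model, optimizer, criterion, loader, accum_steps=4):
        # Accumulate gradients over ``accum_steps`` micro-batches and launch the
        # gradient all-reduce only on the last one.
        for step, (inputs, targets) in enumerate(loader):
            last = (step + 1) % accum_steps == 0
            # no_sync() suppresses the all-reduce on the first N-1 iterations.
            ctx = contextlib.nullcontext() if last else ddp_model.no_sync()
            with ctx:
                loss = criterion(ddp_model(inputs), targets) / accum_steps
                loss.backward()
            if last:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)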
|