
Commit 89f791c

Merge branch 'main' into issue2346
2 parents 0ee1636 + dc448c2

File tree

4 files changed (+20, -4 lines)

_static/img/distributed/fsdp_sharding.png: 91 KB (binary image, rich diff not shown)

beginner_source/transformer_tutorial.py

Lines changed: 10 additions & 3 deletions
@@ -29,7 +29,7 @@

 ######################################################################
 # In this tutorial, we train a ``nn.TransformerEncoder`` model on a
-# language modeling task. Please note that this tutorial does not cover
+# causal language modeling task. Please note that this tutorial does not cover
 # the training of `nn.TransformerDecoder <https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html#torch.nn.TransformerDecoder>`__, as depicted in
 # the right half of the diagram above. The language modeling task is to assign a
 # probability for the likelihood of a given word (or a sequence of words)

@@ -41,8 +41,10 @@
 # Along with the input sequence, a square attention mask is required because the
 # self-attention layers in ``nn.TransformerDecoder`` are only allowed to attend
 # the earlier positions in the sequence. For the language modeling task, any
-# tokens on the future positions should be masked. To produce a probability
-# distribution over output words, the output of the ``nn.TransformerEncoder``
+# tokens on the future positions should be masked. This masking, combined with the fact that
+# the output embeddings are offset with later positions, ensures that the
+# predictions for position i can depend only on the known outputs at positions less than i.
+# To produce a probability distribution over output words, the output of the ``nn.TransformerEncoder``
 # model is passed through a linear layer to output unnormalized logits.
 # The log-softmax function isn't applied here due to the later use of
 # `CrossEntropyLoss <https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html>`__,

@@ -91,6 +93,11 @@ def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
         """
         src = self.embedding(src) * math.sqrt(self.d_model)
         src = self.pos_encoder(src)
+        if src_mask is None:
+            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
+            Unmasked positions are filled with float(0.0).
+            """
+            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
         output = self.transformer_encoder(src, src_mask)
         output = self.linear(output)
         return output
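
For reference, this is what the mask generated by the new default looks like. A minimal standalone sketch, not part of the commit; the length-4 sequence is an arbitrary example:

import torch
import torch.nn as nn

# Causal mask for a length-4 sequence: position i may attend only to
# positions <= i. Masked entries are float('-inf'), visible entries 0.0.
mask = nn.Transformer.generate_square_subsequent_mask(4)
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])

# Added to the raw attention scores before the softmax, the -inf entries
# drive the probability of attending to any future position to exactly zero.
scores = torch.randn(4, 4)
attn = torch.softmax(scores + mask, dim=-1)
print(attn)  # upper triangle (future positions) is 0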

intermediate_source/FSDP_tutorial.rst

Lines changed: 9 additions & 0 deletions
@@ -46,6 +46,15 @@ At a high level FSDP works as follow:
 * Run reduce_scatter to sync gradients
 * Discard parameters.

+One way to view FSDP's sharding is to decompose the DDP gradient all-reduce into reduce-scatter and all-gather. Specifically, during the backward pass, FSDP reduces and scatters gradients, ensuring that each rank possesses a shard of the gradients. Then it updates the corresponding shard of the parameters in the optimizer step. Finally, in the subsequent forward pass, it performs an all-gather operation to collect and combine the updated parameter shards.
+
+.. figure:: /_static/img/distributed/fsdp_sharding.png
+    :width: 100%
+    :align: center
+    :alt: FSDP allreduce
+
+    FSDP Allreduce
+
 How to use FSDP
 --------------
 Here we use a toy model to run training on the MNIST dataset for demonstration purposes. The APIs and logic can be applied to training larger models as well.
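
To make the reduce-scatter/all-gather decomposition concrete, here is a small single-process sketch. Illustrative only: it emulates the arithmetic with plain tensors instead of a real process group (real code would use torch.distributed collectives such as reduce_scatter_tensor and all_gather_into_tensor):

import torch

# Emulate 4 ranks, each holding a full 8-element gradient, as in DDP
# just before gradient synchronization.
world_size, numel = 4, 8
shard_len = numel // world_size
grads = [torch.randn(numel) for _ in range(world_size)]

# DDP: a single all-reduce leaves every rank with the full summed gradient.
allreduced = torch.stack(grads).sum(dim=0)

# FSDP view: reduce-scatter performs the same sum, but rank r keeps only
# its shard of the result and updates only its shard of the parameters.
shards = [allreduced[r * shard_len:(r + 1) * shard_len] for r in range(world_size)]

# The next forward pass all-gathers the updated shards back into the
# full parameter; the combined result matches the all-reduced one.
assert torch.equal(torch.cat(shards), allreduced)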

intermediate_source/tensorboard_profiler_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -178,7 +178,7 @@ def train(data):
 #

 ######################################################################
-# Open the TensorBoard profile URL in Google Chrome browser or Microsoft Edge browser.
+# Open the TensorBoard profile URL in Google Chrome browser or Microsoft Edge browser (**Safari is not supported**).
 #
 # .. code-block::
 #
