Commit 3cdd0ec

more

1 parent 2b7cbb7 commit 3cdd0ec

beginner_source/scaled_dot_product_attention_tutorial.py

Lines changed: 7 additions & 2 deletions
@@ -14,7 +14,7 @@
 # function is named ``torch.nn.functional.scaled_dot_product_attention``.
 # There is some extensive documentation on the function in the `PyTorch
 # documentation <https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html#torch.nn.functional.scaled_dot_product_attention>`__.
-# This function has already been incorporated into torch.nn.MultiheadAttention (Multi-Head Attention) and ``torch.nn.TransformerEncoderLayer``.
+# This function has already been incorporated into ``torch.nn.MultiheadAttention`` and ``torch.nn.TransformerEncoderLayer``.
 #
 # Overview
 # ~~~~~~~
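For context on the function this hunk documents: ``torch.nn.functional.scaled_dot_product_attention`` can also be called directly. A minimal sketch, assuming PyTorch 2.0 or later; the tensor shapes and sizes below are illustrative, not from the tutorial:

import torch
import torch.nn.functional as F

# Shapes follow the (batch, num_heads, seq_len, head_dim) convention.
query = torch.rand(2, 8, 16, 64)
key = torch.rand(2, 8, 16, 64)
value = torch.rand(2, 8, 16, 64)

# Computes softmax(Q @ K^T / sqrt(head_dim)) @ V, using a fused kernel
# where available and the C++ math implementation otherwise.
out = F.scaled_dot_product_attention(query, key, value)
print(out.shape)  # torch.Size([2, 8, 16, 64])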
@@ -32,7 +32,8 @@
 # ~~~~~~~~~~~~~~~~~~~~~~
 #
 # For CUDA tensor inputs the function will dispatch into one of three
-# implementations:
+# implementations
+#
 # * `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness <https://arxiv.org/abs/2205.14135>`__
 # * `Memory-Efficient Attention <https://github.com/facebookresearch/xformers>`__
 # * A PyTorch implementation defined in C++
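The choice among these backends can also be constrained explicitly. A sketch using the ``torch.backends.cuda.sdp_kernel`` context manager from the PyTorch 2.0-era API (later releases moved this functionality to ``torch.nn.attention.sdpa_kernel``); shapes and dtypes here are illustrative assumptions:

import torch
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
query = torch.rand(2, 8, 128, 64, device=device, dtype=dtype)
key = torch.rand(2, 8, 128, 64, device=device, dtype=dtype)
value = torch.rand(2, 8, 128, 64, device=device, dtype=dtype)

if device == "cuda":
    # Disable the math fallback so only the fused FlashAttention and
    # memory-efficient kernels are eligible; an error is raised if
    # neither backend supports the inputs.
    with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=True):
        out = F.scaled_dot_product_attention(query, key, value)
else:
    # On CPU, the C++ math implementation is used.
    out = F.scaled_dot_product_attention(query, key, value)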
@@ -188,6 +189,10 @@ def forward(self, x):
 # NestedTensor and Dense tensor support
 # -------------------------------------
 #
+# Scaled Dot Product Attention supports both NestedTensor and Dense tensor inputs. NestedTensors handle the case where the input is a batch of variable length sequences
+# without needing to pad each sequence to the maximum length in the batch. For more information about NestedTensors see
+# `torch.nested <https://pytorch.org/docs/stable/nested.html>`__ and `NestedTensors Tutorial <https://pytorch.org/tutorials/prototype/nestedtensor.html>`__.
+#

 import random
 def generate_rand_batch(
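To make the added paragraph concrete, a sketch of building a variable-length batch as a NestedTensor; the shapes are illustrative, and the nested-tensor path of ``scaled_dot_product_attention`` may require a CUDA device depending on the PyTorch version:

import torch

# Two sequences of different lengths share one batch without padding.
seq_a = torch.rand(10, 64)   # (seq_len=10, embed_dim)
seq_b = torch.rand(7, 64)    # (seq_len=7, embed_dim)
batch = torch.nested.nested_tensor([seq_a, seq_b])
print(batch.is_nested)  # True

# When a dense tensor is required, pad explicitly to the max length:
padded = torch.nested.to_padded_tensor(batch, padding=0.0)
print(padded.shape)  # torch.Size([2, 10, 64])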
