 1 |  1 | """
 2 |  2 | Create High-Performance Transformer Variations with Scaled Dot Product Attention
 3 |    | -===============================================================
   |  3 | +================================================================================
 4 |  4 |
 5 |  5 | """
 6 |  6 |
 9 |  9 | # Summary
10 | 10 | # ~~~~~~~~
11 | 11 | #
12 |    | -# In this tutorial we want to highlight a new ``torch.nn.functional`` function
   | 12 | +# In this tutorial, we want to highlight a new ``torch.nn.functional`` function
13 | 13 | # that can be helpful for implementing transformer architectures. The
14 | 14 | # function is named ``torch.nn.functional.scaled_dot_product_attention``.
15 |    | -# There is some extensive documentation on the function in the `PyTorch
16 |    | -# documentation <https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html#torch.nn.functional.scaled_dot_product_attention>`__.
   | 15 | +# For a detailed description of the function, see the `PyTorch documentation <https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html#torch.nn.functional.scaled_dot_product_attention>`__.
17 | 16 | # This function has already been incorporated into ``torch.nn.MultiheadAttention`` and ``torch.nn.TransformerEncoderLayer``.
18 | 17 | #
19 | 18 | # Overview
20 |    | -# ~~~~~~~
21 |    | -# At a high level this PyTorch function calculates the
22 |    | -# scaled dot product attention between query, key, and value according to
   | 19 | +# ~~~~~~~~~
   | 20 | +# At a high level, this PyTorch function calculates the
   | 21 | +# scaled dot product attention (SDPA) between query, key, and value according to
23 | 22 | # the definition found in the paper `Attention is all you
24 | 23 | # need <https://arxiv.org/abs/1706.03762>`__. While this function can be
25 | 24 | # written in PyTorch using existing functions, for GPU tensors this
28 | 27 | # attention mechanisms such as
29 | 28 | # `Linformer <https://arxiv.org/abs/2006.04768>`__
30 | 29 | #
31 |    | -# Fused implementations:
   | 30 | +# Fused implementations
32 | 31 | # ~~~~~~~~~~~~~~~~~~~~~~
33 | 32 | #
34 |    | -# For CUDA tensor inputs the function will dispatch into one of three
35 |    | -# implementations
   | 33 | +# For CUDA tensor inputs, the function will dispatch into one of the following
   | 34 | +# implementations:
36 | 35 | #
37 | 36 | # * `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness <https://arxiv.org/abs/2205.14135>`__
38 | 37 | # * `Memory-Efficient Attention <https://github.com/facebookresearch/xformers>`__
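Editor's note: to make the API introduced above concrete, here is a minimal sketch of calling ``torch.nn.functional.scaled_dot_product_attention`` directly. The tensor shapes and the ``is_causal`` flag are arbitrary choices for illustration, not values taken from the tutorial.

# A minimal call to scaled_dot_product_attention; shapes are illustrative.
import torch
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"

# (batch, num_heads, seq_len, head_dim)
query = torch.rand(2, 8, 64, 32, device=device)
key = torch.rand(2, 8, 64, 32, device=device)
value = torch.rand(2, 8, 64, 32, device=device)

# Computes softmax(QK^T / sqrt(head_dim)) V, dispatching to a fused kernel
# when the inputs and hardware allow it.
out = F.scaled_dot_product_attention(query, key, value, is_causal=True)
print(out.shape)  # torch.Size([2, 8, 64, 32])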
@@ -117,10 +116,10 @@ def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
117 | 116 | # ~~~~~~~~~~~~~~~~~~~
118 | 117 | #
119 | 118 | # Depending on what machine you ran the above cell on and what hardware is
120 |     | -# available your results might be different.
121 |     | -# - If you don’t have a GPU and are running on CPU then the context manager will have no effect and all
    | 119 | +# available, your results might be different.
    | 120 | +# - If you don’t have a GPU and are running on CPU, then the context manager will have no effect and all
123 |     | -#   three run should return similar timings.
    | 122 | +#   three runs should return similar timings.
124 | 123 | #
125 | 124 |
126 | 125 |
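Editor's note: as a rough sketch of the backend selection this passage refers to, the snippet below restricts SDPA to a single fused kernel. It assumes the PyTorch 2.0-era ``torch.backends.cuda.sdp_kernel`` context manager (newer releases expose ``torch.nn.attention.sdpa_kernel`` instead); shapes and dtype are illustrative.

# Restricting SDPA to one backend with the PyTorch 2.0-era context manager.
import torch
import torch.nn.functional as F

if torch.cuda.is_available():
    q = torch.rand(2, 8, 64, 32, device="cuda", dtype=torch.float16)
    k = torch.rand(2, 8, 64, 32, device="cuda", dtype=torch.float16)
    v = torch.rand(2, 8, 64, 32, device="cuda", dtype=torch.float16)
    # Allow only FlashAttention; SDPA raises if it cannot satisfy the request
    # on this GPU / input combination.
    with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
        out = F.scaled_dot_product_attention(q, k, v)
else:
    # On CPU the context manager has no effect and the math implementation runs.
    q = k = v = torch.rand(2, 8, 64, 32)
    out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)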
@@ -189,7 +188,7 @@ def forward(self, x):
189 | 188 | # NestedTensor and Dense tensor support
190 | 189 | # -------------------------------------
191 | 190 | #
192 |     | -# Scaled Dot Product Attention supports both NestedTensor and Dense tensor inputs. NestedTensors handle the case where the input is a batch of variable length sequences
    | 191 | +# SDPA supports both NestedTensor and Dense tensor inputs. NestedTensors handle the case where the input is a batch of variable length sequences
193 | 192 | # without needing to pad each sequence to the maximum length in the batch. For more information about NestedTensors, see
194 | 193 | # `torch.nested <https://pytorch.org/docs/stable/nested.html>`__ and `NestedTensors Tutorial <https://pytorch.org/tutorials/prototype/nestedtensor.html>`__.
195 | 194 | #
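Editor's note: a small sketch of the NestedTensor path described above, assuming ``torch.nested.nested_tensor`` to build a batch of three sequences with different lengths and passing it to SDPA without padding. The lengths and head sizes are arbitrary, and the guard reflects that nested-input support varies by device, dtype, and PyTorch release.

# Building a nested (variable-length) batch and passing it straight to SDPA.
import torch
import torch.nn.functional as F

num_heads, head_dim = 4, 8
seq_lens = [6, 2, 9]  # three sequences of different lengths, no padding

def rand_nested():
    # One (num_heads, seq_len, head_dim) tensor per sequence.
    return torch.nested.nested_tensor(
        [torch.rand(num_heads, L, head_dim) for L in seq_lens]
    )

query, key, value = rand_nested(), rand_nested(), rand_nested()
print(query.is_nested)  # True

# Nested-input support depends on the device, dtype, and PyTorch release,
# hence the guard; when it succeeds the output is nested as well.
try:
    out = F.scaled_dot_product_attention(query, key, value)
    print(out.is_nested)  # per-sequence lengths are preserved
except RuntimeError as err:
    print(f"Nested inputs not supported by this build/backend: {err}")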
@@ -244,8 +243,8 @@ def generate_rand_batch(
244 | 243 | # Using SDPA with torch.compile
245 | 244 | # =============================
246 | 245 | #
247 |     | -# Scaled dot product attention is composable with torch.compile(). Lets
248 |     | -# try compiling the above CausalSelfAttention module
    | 246 | +# Scaled dot product attention is composable with ``torch.compile()``. Let's
    | 247 | +# try compiling the above ``CausalSelfAttention`` module:
249 | 248 | #
250 | 249 |
251 | 250 | batch_size = 32
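Editor's note: the tutorial compiles its ``CausalSelfAttention`` module, which is not reproduced in this excerpt. The sketch below uses a tiny stand-in attention block so it stays self-contained, and simply shows that a module calling SDPA composes with ``torch.compile``; the name ``TinyAttention`` and all sizes are illustrative.

# A tiny stand-in attention module (not the tutorial's CausalSelfAttention)
# compiled with torch.compile; all sizes below are illustrative.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyAttention(nn.Module):
    def __init__(self, embed_dim: int = 64, num_heads: int = 4):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv = nn.Linear(embed_dim, 3 * embed_dim)
        self.proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        B, T, C = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # (B, T, C) -> (B, num_heads, T, head_dim) for SDPA
        q = q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        return self.proj(y.transpose(1, 2).reshape(B, T, C))

device = "cuda" if torch.cuda.is_available() else "cpu"
model = TinyAttention().to(device)
compiled_model = torch.compile(model)       # SDPA traces through torch.compile
x = torch.rand(32, 128, 64, device=device)  # (batch_size, seq_len, embed_dim)
print(compiled_model(x).shape)              # torch.Size([32, 128, 64])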
@@ -289,7 +288,7 @@ def generate_rand_batch(
289 | 288 | print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
290 | 289 |
291 | 290 | # For even more insights, you can export the trace and use ``chrome://tracing`` to view the results:
292 |     | -# prof.export_chrome_trace("compiled_causal_attention_trace.json")
    | 291 | +# ``prof.export_chrome_trace("compiled_causal_attention_trace.json")``.
293 | 292 |
294 | 293 |
295 | 294 |
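Editor's note: for reference, a self-contained sketch of producing a profile like the one printed above and exporting a Chrome trace with ``torch.profiler``. Profiling a raw SDPA call rather than the tutorial's compiled model, the iteration count, and the output file name ``sdpa_profile_trace.json`` are all placeholder choices.

# Profiling repeated SDPA calls and exporting a Chrome trace.
import torch
import torch.nn.functional as F
from torch.profiler import profile, ProfilerActivity

device = "cuda" if torch.cuda.is_available() else "cpu"
q = k = v = torch.rand(32, 8, 128, 64, device=device)  # (batch, heads, seq, head_dim)

activities = [ProfilerActivity.CPU]
if device == "cuda":
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities, record_shapes=True) as prof:
    for _ in range(10):  # a few iterations for more stable aggregate timings
        F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
# Load the exported JSON in chrome://tracing (or Perfetto) for a timeline view.
prof.export_chrome_trace("sdpa_profile_trace.json")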