1 | 1 | """
2 |   |   -Implement High-Performance Transformers with SCALED DOT PRODUCT ATTENTION
  | 2 |   +Implementing High-Performance Transformers with SCALED DOT PRODUCT ATTENTION
3 | 3 | ================================================================================
4 | 4 |
5 | 5 | """

20 | 20 | # At a high level, this PyTorch function calculates the
21 | 21 | # scaled dot product attention (SDPA) between query, key, and value according to
22 | 22 | # the definition found in the paper `Attention is all you
23 |    |   -# need <https://arxiv.org/abs/1706.03762>`__. While this function can be
24 |    |   -# written in PyTorch using existing functions, for GPU tensors this
25 |    |   -# function will implicitly dispatch to an optimized implementation.
   | 23 |   +# need <https://arxiv.org/abs/1706.03762>`__. While this function can
   | 24 |   +# be written in PyTorch using existing functions, a fused implementation can provide
   | 25 |   +# large performance benefits over a naive implementation.
26 | 26 | #
27 | 27 | # Fused implementations
28 | 28 | # ~~~~~~~~~~~~~~~~~~~~~~
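The reworded lines emphasize that ``torch.nn.functional.scaled_dot_product_attention`` dispatches to a fused kernel when one is available. A minimal sketch (not from the patch itself; the tensor shapes are illustrative assumptions) of calling the function directly:

```python
# A minimal sketch of calling SDPA directly; shapes are illustrative assumptions.
import torch
import torch.nn.functional as F

# (batch, num_heads, seq_len, head_dim)
query = torch.randn(2, 8, 128, 64)
key = torch.randn(2, 8, 128, 64)
value = torch.randn(2, 8, 128, 64)

# Computes softmax(Q @ K^T / sqrt(head_dim)) @ V, dispatching to a fused
# kernel (FlashAttention, memory-efficient attention) or the math fallback
# depending on the inputs and the hardware.
out = F.scaled_dot_product_attention(query, key, value)
print(out.shape)  # torch.Size([2, 8, 128, 64])
```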
@@ -114,10 +114,10 @@ def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
114 | 114 | #
115 | 115 | # Depending on what machine you ran the above cell on and what hardware is
116 | 116 | # available, your results might be different.
117 |     |   -# - If you don’t have a GPU and are running on CPU, then the context manager will have no effect and all
118 |     |   -# are running on CPU then the context manager will have no effect and all
119 |     |   -# three runs should return similar timings.
120 |     |   -#
    | 117 |   +# - If you don’t have a GPU and are running on CPU then the context manager
    | 118 |   +# will have no effect and all three runs should return similar timings.
    | 119 |   +# - Depending on what compute capability your graphics card supports
    | 120 |   +# flash attention or memory efficient might have failed.
121 | 121 |
122 | 122 |
123 | 123 | ######################################################################
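The context manager mentioned here is the kernel-selection guard used earlier in the tutorial. A minimal sketch of that pattern, assuming a CUDA build of PyTorch 2.0+ with ``torch.backends.cuda.sdp_kernel`` (newer releases expose ``torch.nn.attention.sdpa_kernel`` instead):

```python
# A minimal sketch: restrict SDPA to the FlashAttention backend and fall back
# gracefully if the hardware does not support it. Assumes PyTorch 2.0+.
import torch
import torch.nn.functional as F

if torch.cuda.is_available():
    q, k, v = (torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
               for _ in range(3))
    try:
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=False, enable_mem_efficient=False
        ):
            out = F.scaled_dot_product_attention(q, k, v)
    except RuntimeError:
        # GPUs with an older compute capability cannot run FlashAttention.
        print("FlashAttention is not supported on this GPU.")
else:
    # On CPU the context manager has no effect; the math path is used.
    print("No GPU available; all three backend choices run the same CPU path.")
```

Without the context manager, the dispatcher simply picks the fastest backend available, which is why the three timings differ only on supported GPUs.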
@@ -186,7 +186,7 @@ def forward(self, x):
186 | 186 | # -------------------------------------
187 | 187 | #
188 | 188 | # SDPA supports both NestedTensor and Dense tensor inputs. NestedTensors handle the case where the input is a batch of variable length sequences
189 |     |   -# without needing to pad each sequence to the maximum length in the batch. For more information about NestedTensor's see
    | 189 |   +# without needing to pad each sequence to the maximum length in the batch. For more information about NestedTensors see
190 | 190 | # `torch.nested <https://pytorch.org/docs/stable/nested.html>`__ and `NestedTensors Tutorial <https://pytorch.org/tutorials/prototype/nestedtensor.html>`__.
191 | 191 | #
192 | 192 |
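A minimal sketch of the NestedTensor path described above (not from the patch; the sequence lengths and sizes are illustrative assumptions):

```python
# A minimal sketch: self-attention over a ragged batch via NestedTensor,
# with no padding to the maximum sequence length.
import torch
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
num_heads, head_dim = 8, 64

# Two sequences of different lengths; each entry is (num_heads, seq_len, head_dim),
# so the NestedTensor has layout (batch, heads, ragged seq_len, head_dim).
seqs = [torch.randn(num_heads, 100, head_dim, device=device, dtype=dtype),
        torch.randn(num_heads, 37, head_dim, device=device, dtype=dtype)]
nt = torch.nested.nested_tensor(seqs)

try:
    out = F.scaled_dot_product_attention(nt, nt, nt)
    print(out.is_nested)  # True
except RuntimeError:
    # NestedTensor inputs may require one of the fused CUDA backends.
    print("NestedTensor SDPA is not supported on this platform.")
```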
@@ -246,8 +246,12 @@ def generate_rand_batch(
246 | 246 | # Using SDPA with torch.compile
247 | 247 | # ============================
248 | 248 | #
249 |     |   -# Scaled dot product attention is composable with ``torch.compile()``. Let's
250 |     |   -# try compiling the above CausalSelfAttention module:
    | 249 |   +# With the release of PyTorch 2.0, a new feature called
    | 250 |   +# ``torch.compile()`` has been introduced, which can provide
    | 251 |   +# significant performance improvements over eager mode.
    | 252 |   +# Scaled dot product attention is fully composable with ``torch.compile()``.
    | 253 |   +# To demonstrate this, let's compile the CausalSelfAttention module using
    | 254 |   +# ``torch.compile()`` and observe the resulting performance improvements.
251 | 255 | #
252 | 256 |
253 | 257 | batch_size = 32
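A minimal sketch of the compile-and-run pattern this hunk introduces; the ``TinyAttention`` module below is a hypothetical stand-in for the tutorial's ``CausalSelfAttention``:

```python
# A minimal sketch: wrap an SDPA-based module with torch.compile() (PyTorch 2.0+).
import torch
import torch.nn.functional as F

class TinyAttention(torch.nn.Module):
    """Hypothetical stand-in for the tutorial's CausalSelfAttention module."""

    def __init__(self, embed_dim=512, num_heads=8):
        super().__init__()
        self.num_heads = num_heads
        self.qkv = torch.nn.Linear(embed_dim, 3 * embed_dim)
        self.proj = torch.nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        B, T, C = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # Reshape to (batch, heads, seq_len, head_dim) for SDPA.
        q, k, v = (t.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)
                   for t in (q, k, v))
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        return self.proj(y.transpose(1, 2).reshape(B, T, C))

device = "cuda" if torch.cuda.is_available() else "cpu"
model = TinyAttention().to(device)
compiled_model = torch.compile(model)  # compilation happens on the first call
x = torch.randn(32, 128, 512, device=device)
out = compiled_model(x)
```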
@@ -304,7 +308,7 @@ def generate_rand_batch(
304 | 308 | # that consumed the most GPU execution time, for both the compiled and non-compiled module.
305 | 309 | # The analysis reveals that the majority of time spent on the GPU is concentrated
306 | 310 | # on the same set of functions for both modules.
307 |     |   -# The problem here is that ``torch.compile`` is very good at removing the
    | 311 |   +# The reason for this is that ``torch.compile`` is very good at removing the
308 | 312 | # framework overhead associated with PyTorch. If your model is launching
309 | 313 | # large, efficient CUDA kernels, which in this case CausalSelfAttention
310 | 314 | # is, then the overhead of PyTorch can be hidden.
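The kernel-level comparison the text refers to can be reproduced with ``torch.profiler``. A minimal sketch, assuming a CUDA device and reusing the hypothetical ``model`` / ``compiled_model`` pair from the previous snippet:

```python
# A minimal sketch: list the ops that dominate GPU time for the eager and
# compiled modules. Assumes CUDA and the `model`/`compiled_model` names above.
import torch
from torch.profiler import profile, ProfilerActivity

x = torch.randn(32, 128, 512, device="cuda")

for name, module in [("eager", model), ("compiled", compiled_model)]:
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for _ in range(10):
            module(x)
    print(f"== {name} ==")
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=5))
```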