Commit 6c67d6d

Commit message: updates
1 parent: ee07c3c

File tree: 2 files changed, +19 -14 lines


index.rst

Lines changed: 2 additions & 2 deletions
@@ -528,7 +528,7 @@ What's new in PyTorch tutorials?
    :header: (beta) Implement High-Performance Transformers with SCALED DOT PRODUCT ATTENTION
    :card_description: This tutorial explores the new torch.nn.functional.scaled_dot_product_attention and how it can be used to construct Transformer components.
    :image: _static/img/thumbnails/cropped/pytorch-logo.png
-   :link: beginner/scaled_dot_product_attention_tutorial.html
+   :link: intermediate/scaled_dot_product_attention_tutorial.html
    :tags: Model-Optimization,Attention,Transformer

 .. Parallel-and-Distributed-Training
@@ -916,7 +916,7 @@ Additional Resources
    intermediate/nvfuser_intro_tutorial
    intermediate/ax_multiobjective_nas_tutorial
    intermediate/torch_compile_tutorial
-   beginner/scaled_dot_product_attention_tutorial
+   intermediate/scaled_dot_product_attention_tutorial

 .. toctree::
    :maxdepth: 2

beginner_source/scaled_dot_product_attention_tutorial.py renamed to intermediate_source/scaled_dot_product_attention_tutorial.py

Lines changed: 17 additions & 12 deletions
@@ -12,7 +12,7 @@
 # In this tutorial, we want to highlight a new ``torch.nn.functional`` function
 # that can be helpful for implementing transformer architectures. The
 # function is named ``torch.nn.functional.scaled_dot_product_attention``.
-# For detailed description of the function, see the `PyTorch# documentation <https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html#torch.nn.functional.scaled_dot_product_attention>`__.
+# For detailed description of the function, see the `PyTorch documentation <https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html#torch.nn.functional.scaled_dot_product_attention>`__.
 # This function has already been incorporated into ``torch.nn.MultiheadAttention`` and ``torch.nn.TransformerEncoderLayer``.
 #
 # Overview
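
Side note on the incorporation into ``torch.nn.MultiheadAttention`` mentioned above: existing module-level code can benefit without changes. A minimal sketch (the hyperparameters and shapes below are illustrative, not taken from the tutorial):

    import torch
    import torch.nn as nn

    # Illustrative self-attention module; embed_dim and num_heads are arbitrary here.
    mha = nn.MultiheadAttention(embed_dim=512, num_heads=8, batch_first=True)
    x = torch.randn(2, 128, 512)  # (batch, seq_len, embed_dim)

    # need_weights=False is one of the conditions that lets the module use its fused fast path.
    out, _ = mha(x, x, x, need_weights=False)
    print(out.shape)  # torch.Size([2, 128, 512])
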
@@ -22,10 +22,7 @@
 # the definition found in the paper `Attention is all you
 # need <https://arxiv.org/abs/1706.03762>`__. While this function can be
 # written in PyTorch using existing functions, for GPU tensors this
-# function will implicitly dispatch to an optimized implementation. The
-# function is also highly modular and can be used to implement other
-# attention mechanisms such as
-# `Linformer <https://arxiv.org/abs/2006.04768>`__
+# function will implicitly dispatch to an optimized implementation.
 #
 # Fused implementations
 # ~~~~~~~~~~~~~~~~~~~~~~
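
For readers skimming the diff, a minimal sketch of calling the function directly (tensor shapes and the ``is_causal`` flag are illustrative, not taken from the tutorial):

    import torch
    import torch.nn.functional as F

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Illustrative shapes: (batch, num_heads, seq_len, head_dim)
    query = torch.randn(2, 8, 128, 64, device=device)
    key = torch.randn(2, 8, 128, 64, device=device)
    value = torch.randn(2, 8, 128, 64, device=device)

    # On GPU tensors this implicitly dispatches to a fused implementation when one is eligible.
    out = F.scaled_dot_product_attention(query, key, value, is_causal=True)
    print(out.shape)  # torch.Size([2, 8, 128, 64])
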
@@ -234,7 +231,7 @@ def generate_rand_batch(
 # Currently the fastpaths don't support NestedTensor for training
 random_nt, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=0.5, dtype=dtype, device=device)
 random_dense, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=None, dtype=dtype, device=device)
-model.requires_grad_(False)
+model.eval()
 print(f"Random NT runs in {benchmark_torch_function_in_microseconds(model, random_nt):.3f} microseconds")
 print(f"Random Dense runs in {benchmark_torch_function_in_microseconds(model, random_dense):.3f} microseconds")
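
Context for the ``requires_grad_(False)`` → ``eval()`` change (not part of the diff itself): ``requires_grad_(False)`` only freezes parameters for autograd, while ``eval()`` switches layers such as dropout to inference behavior; for pure benchmarking the two are often combined with ``torch.no_grad()``. A minimal sketch with a stand-in model:

    import torch
    import torch.nn as nn

    # Hypothetical stand-in module, only to illustrate the difference.
    model = nn.Sequential(nn.Linear(64, 64), nn.Dropout(p=0.1))
    x = torch.randn(8, 64)

    model.eval()              # dropout becomes a no-op; batchnorm would use running stats
    with torch.no_grad():     # autograd records nothing during the timed calls
        y = model(x)

    model.requires_grad_(False)  # by itself this only stops gradient tracking of the parameters
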

@@ -256,14 +253,17 @@ def generate_rand_batch(
 
 
 compiled_model = torch.compile(model)
-# Lets warm it up once
+# Let's compile it
 compiled_model(x)
 print(
     f"The compiled module runs in {benchmark_torch_function_in_microseconds(compiled_model, x):.3f} microseconds")
 
 
 ######################################################################
 #
+# The exact execution time depends on the machine; on mine the results were:
+# The non-compiled module runs in 166.616 microseconds
+# The compiled module runs in 166.726 microseconds
 # That is not what we were expecting. Let's dig a little deeper.
 # PyTorch comes with an amazing built-in profiler that you can use to
 # inspect the performance characteristics of your code.
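
As a side note on the benchmark above: the first call to a ``torch.compile``-wrapped module pays the compilation cost, so it is usually excluded from timing. A hedged sketch of that pattern, using ``torch.utils.benchmark`` in place of the tutorial's ``benchmark_torch_function_in_microseconds`` helper and a stand-in module:

    import torch
    import torch.utils.benchmark as benchmark

    model = torch.nn.Linear(512, 512)   # stand-in module
    x = torch.randn(32, 512)

    compiled_model = torch.compile(model)
    compiled_model(x)                   # warm-up: the first call triggers compilation

    def time_us(fn, inp):
        # Hypothetical helper: mean runtime in microseconds over 50 runs.
        t = benchmark.Timer(stmt="fn(inp)", globals={"fn": fn, "inp": inp})
        return t.timeit(50).mean * 1e6  # Timer reports seconds

    print(f"eager:    {time_us(model, x):.1f} us")
    print(f"compiled: {time_us(compiled_model, x):.1f} us")
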
@@ -278,14 +278,14 @@ def generate_rand_batch(
     with record_function(" Non-Compiled Causal Attention"):
         for _ in range(25):
             model(x)
-print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
+print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
 
 
 with profile(activities=activities, record_shapes=False) as prof:
     with record_function("Compiled Causal Attention"):
         for _ in range(25):
             compiled_model(x)
-print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
+print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
 
 # For even more insights, you can export the trace and use ``chrome://tracing`` to view the results
 # prof.export_chrome_trace("compiled_causal_attention_trace.json").
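
For readers reproducing this outside the tutorial, the profiler boilerplate around these lines looks roughly like the following (the tutorial defines ``activities`` earlier; a CUDA device and a stand-in module are assumed here):

    import torch
    from torch.profiler import profile, record_function, ProfilerActivity

    model = torch.nn.Linear(512, 512, device="cuda")   # stand-in module
    x = torch.randn(32, 512, device="cuda")

    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
    with profile(activities=activities, record_shapes=False) as prof:
        with record_function("Causal Attention"):
            for _ in range(25):
                model(x)

    # Sorting by cuda_time_total surfaces the kernels that dominate GPU time.
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
    # prof.export_chrome_trace("attention_trace.json")  # optional: view in chrome://tracing
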
@@ -294,16 +294,21 @@ def generate_rand_batch(
 
 
 ######################################################################
+# The previous code snippet generates a report of the top 10 PyTorch functions
+# that consumed the most GPU execution time, for both the compiled and non-compiled module.
+# The analysis reveals that the majority of time spent on the GPU is concentrated
+# on the same set of functions for both modules.
 # The problem here is that ``torch.compile`` is very good at removing the
 # framework overhead associated with PyTorch. If your model is launching
 # large, efficient CUDA kernels, which in this case CausalSelfAttention
-# is, then the overhead of ``torch.compile`` can hurt performance.
+# is, then the overhead of PyTorch can be hidden.
 #
 # In reality, your module does not normally consist of a singular
 # CausalSelfAttention block. When experimenting with Andrej Karpathy’s
 # `NanoGPT <https://github.com/karpathy/nanoGPT>`__ repository, compiling
-# the module took the time per train step from: ``902.01ms`` to
-# ``552.06ms``!
+# the module took the time per train step from ``6090.49ms`` to
+# ``3273.17ms``! This was done on commit ae3a8d5 of NanoGPT, training on
+# the Shakespeare dataset.
 #
 
 