From 215d81e48f5ea8ffd806e4ac426b310b6ef2ea07 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 30 Sep 2024 16:45:23 -0700 Subject: [PATCH 1/5] Regional compilation recipe --- recipes_source/recipes/README.txt | 4 + .../recipes/regional_compilation.py | 162 ++++++++++++++++++ recipes_source/recipes_index.rst | 10 ++ 3 files changed, 176 insertions(+) create mode 100644 recipes_source/recipes/regional_compilation.py diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index 18e4d7106b1..ef45bae081c 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -56,3 +56,7 @@ PyTorch Recipes 14. amp_recipe.py Automatic Mixed Precision https://pytorch.org/tutorials/recipes/amp_recipe.html + +15. regional_compilation.py + Reducing torch.compile cold start compilation time with regional compilation + https://pytorch.org/tutorials/recipes/regional_compilation.html diff --git a/recipes_source/recipes/regional_compilation.py b/recipes_source/recipes/regional_compilation.py new file mode 100644 index 00000000000..afb867e1763 --- /dev/null +++ b/recipes_source/recipes/regional_compilation.py @@ -0,0 +1,162 @@ +""" +Reducing torch.compile cold start compilation time with regional compilation +============================================================================ + +Introduction +------------ +As deep learning models get larger, the compilation time of these models also +increase. This increase in compilation time can lead to a large startup time in +inference services or wasted resources in large scale training. This recipe +shows an example of how to reduce the cold start compilation time by choosing to +compile a repeated region of the model instead of the entire model. + +Setup +----- +Before we begin, we need to install ``torch`` if it is not already +available. + +.. code-block:: sh + + pip install torch + +""" + + + +###################################################################### +# Steps +# ----- +# +# 1. Import all necessary libraries +# 2. Define and initialize a neural network with repeated regions. +# 3. Understand the difference between the full model and the regional compilation. +# 4. Measure the compilation time of the full model and the regional compilation. +# +# 1. Import necessary libraries for loading our data +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# + +import torch +import torch.nn as nn +from time import perf_counter + +# +# 2. Define and initialize a neural network with repeated regions. +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Typically neural networks are composed of repeated layers. For example, a +# large language model is composed of many Transformer blocks. In this recipe, +# we will create a `Layer` `nn.Module` class as a proxy for a repeated region. +# We will then create a `Model` which is composed of 64 instances of this +# `Layer` class. +# +class Layer(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear1 = torch.nn.Linear(10, 10) + self.relu1 = torch.nn.ReLU() + self.linear2 = torch.nn.Linear(10, 10) + self.relu2 = torch.nn.ReLU() + + def forward(self, x): + a = self.linear1(x) + a = self.relu1(a) + a = torch.sigmoid(a) + b = self.linear2(a) + b = self.relu2(b) + return b + +class Model(torch.nn.Module): + def __init__(self, apply_regional_compilation): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + # Apply compile only to the repeated layers. 
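+        # All 64 instances share the same Layer.forward code, so the compiled
+        # artifact produced for one instance is reused for the others and the
+        # repeated region is compiled only once rather than once per layer.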
+ if apply_regional_compilation: + self.layers = torch.nn.ModuleList([torch.compile(Layer()) for _ in range(64)]) + else: + self.layers = torch.nn.ModuleList([Layer() for _ in range(64)]) + + def forward(self, x): + # In regional compilation, the self.linear is outside of the scope of `torch.compile`. + x = self.linear(x) + for layer in self.layers: + x = layer(x) + return x + +# +# 3. Understand the difference between the full model and the regional compilation. +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In full model compilation, the full model is compiled as a whole. This is how +# most users use torch.compile. In this example, we can apply torch.compile to +# the `model` object. This will effectively inline the 64 layers, producing a +# large graph to compile. You can look at the full graph by running this recipe +# with `TORCH_LOGS=graph_code`. +# +# + +model = Model(apply_regional_compilation=False).cuda() +full_compiled_model = torch.compile(model) + + +# +# The regional compilation, on the other hand, compiles a region of the model. +# By wisely choosing to compile a repeated region of the model, we can compile a +# much smaller graph and then reuse the compiled graph for all the regions. We +# can apply regional compilation in the example as follows. `torch.compile` is +# applied only to the `layers` and not the full model. +# + +regional_compiled_model = Model(apply_regional_compilation=True).cuda() + +# Applying compilation to a repeated region, instead of full model, leads to +# large savings in compile time. Here, we will just compile a layer instance and +# then reuse it 64 times in the `model` object. +# +# Note that with repeated regions, some part of the model might not be compiled. +# For example, the `self.linear` in the `Model` is outside of the scope of +# regional compilation. +# +# Also, note that there is a tradeoff between performance speedup and compile +# time. The full model compilation has larger graph and therefore, +# theoretically, has more scope for optimizations. However for practical +# purposes and depending on the model, we have observed many cases with minimal +# speedup differences between the full model and regional compilation. + + +# +# 4. Measure the compilation time of the full model and the regional compilation. +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# `torch.compile` is a JIT compiler, i.e., it compiles on the first invocation. +# Here, we measure the total time spent in the first invocation. This is not +# precise, but it gives a good idea because the majority of time is spent in +# compilation. + +def measure_latency(fn, input): + # Reset the compiler caches to ensure no reuse between different runs + torch.compiler.reset() + with torch._inductor.utils.fresh_inductor_cache(): + start = perf_counter() + fn(input) + torch.cuda.synchronize() + end = perf_counter() + return end - start + +input = torch.randn(10, 10, device="cuda") +full_model_compilation_latency = measure_latency(full_compiled_model, input) +print(f"Full model compilation time = {full_model_compilation_latency:.2f} seconds") + +regional_compilation_latency = measure_latency(regional_compiled_model, input) +print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds") + +############################################################################ +# This recipe shows how to control the cold start compilation time if your model +# has repeated regions. 
This requires user changes to apply `torch.compile` to +# the repeated regions instead of more commonly used full model compilation. We +# are continually working on reducing cold start compilation time. So, please +# stay tuned for our next tutorials. +# +# This feature is available with 2.5 release. If you are on 2.4, you can use a +# config flag - `torch._dynamo.config.inline_inbuilt_nn_modules=True` to avoid +# recompilations on the regional compilation. In 2.5, this flag is turned on by +# default. diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index caccdcc28f7..5d24b328f73 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -339,6 +339,15 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/torch_compile_caching_tutorial.html :tags: Model-Optimization +.. Reducing Cold Start Compilation Time with Regional Compilation + +.. customcarditem:: + :header: Reducing torch.compile cold start compilation time with regional compilation + :card_description: Learn how to use regional compilation to control cold start compile time + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/recipes/regional_compilation.html + :tags: Model-Optimization + .. Intel(R) Extension for PyTorch* .. customcarditem:: @@ -452,6 +461,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/amp_recipe /recipes/recipes/tuning_guide /recipes/recipes/xeon_run_cpu + /recipes/recipes/regional_compilation /recipes/recipes/intel_extension_for_pytorch /recipes/compiling_optimizer /recipes/torch_compile_backend_ipex From 52fe9489e3068f870ff987bd4ecdb43c8e5148aa Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 1 Oct 2024 10:41:17 -0700 Subject: [PATCH 2/5] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- .../recipes/regional_compilation.py | 83 ++++++++++--------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/recipes_source/recipes/regional_compilation.py b/recipes_source/recipes/regional_compilation.py index afb867e1763..8641e8cdf24 100644 --- a/recipes_source/recipes/regional_compilation.py +++ b/recipes_source/recipes/regional_compilation.py @@ -2,14 +2,17 @@ Reducing torch.compile cold start compilation time with regional compilation ============================================================================ -Introduction ------------- +**Author:** `Animesh Jain `_ As deep learning models get larger, the compilation time of these models also -increase. This increase in compilation time can lead to a large startup time in -inference services or wasted resources in large scale training. This recipe +increases. This extended compilation time can result in a large startup time in +inference services or wasted resources in large-scale training. This recipe shows an example of how to reduce the cold start compilation time by choosing to compile a repeated region of the model instead of the entire model. +Prerequisites +---------------- + +* Pytorch 2.5 or later Setup ----- Before we begin, we need to install ``torch`` if it is not already @@ -19,6 +22,10 @@ pip install torch +.. note:: + This feature is available starting with the 2.5 release. If you are using version 2.4, + you can enable the configuration flag ``torch._dynamo.config.inline_inbuilt_nn_modules=True`` + to prevent recompilations during regional compilation. In version 2.5, this flag is enabled by default. 
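+
+    For example, on version 2.4 the flag can be enabled before compiling the repeated regions:
+
+    .. code-block:: python
+
+        import torch._dynamo
+
+        # Reuse the compiled region across identical layer instances on 2.4.
+        torch._dynamo.config.inline_inbuilt_nn_modules = True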
""" @@ -27,13 +34,15 @@ # Steps # ----- # -# 1. Import all necessary libraries +# In this recipe, we will follow these steps: +# +# 1. Import all necessary libraries. # 2. Define and initialize a neural network with repeated regions. # 3. Understand the difference between the full model and the regional compilation. # 4. Measure the compilation time of the full model and the regional compilation. # -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# First, let's import the necessary libraries for loading our data: +# # # @@ -41,14 +50,14 @@ import torch.nn as nn from time import perf_counter +########################################################## +# Next, let's define and initialize a neural network with repeated regions. # -# 2. Define and initialize a neural network with repeated regions. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Typically neural networks are composed of repeated layers. For example, a +# Typically, neural networks are composed of repeated layers. For example, a # large language model is composed of many Transformer blocks. In this recipe, -# we will create a `Layer` `nn.Module` class as a proxy for a repeated region. -# We will then create a `Model` which is composed of 64 instances of this -# `Layer` class. +# we will create a ``Layer`` using the ``nn.Module`` class as a proxy for a repeated region. +# We will then create a ``Model`` which is composed of 64 instances of this +# ``Layer`` class. # class Layer(torch.nn.Module): def __init__(self): @@ -83,15 +92,14 @@ def forward(self, x): x = layer(x) return x -# -# 3. Understand the difference between the full model and the regional compilation. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +#################################################### +# Next, let's review the difference between the full model and the regional compilation. # -# In full model compilation, the full model is compiled as a whole. This is how -# most users use torch.compile. In this example, we can apply torch.compile to -# the `model` object. This will effectively inline the 64 layers, producing a +# In full model compilation, the entire model is compiled as a whole. This is the common approach +# most users take with ``torch.compile``. In this example, we apply ``torch.compile`` to +# the ``Model`` object. This will effectively inline the 64 layers, producing a # large graph to compile. You can look at the full graph by running this recipe -# with `TORCH_LOGS=graph_code`. +# with ``TORCH_LOGS=graph_code``. # # @@ -99,37 +107,36 @@ def forward(self, x): full_compiled_model = torch.compile(model) -# +################################################### # The regional compilation, on the other hand, compiles a region of the model. -# By wisely choosing to compile a repeated region of the model, we can compile a -# much smaller graph and then reuse the compiled graph for all the regions. We -# can apply regional compilation in the example as follows. `torch.compile` is -# applied only to the `layers` and not the full model. +# By strategically choosing to compile a repeated region of the model, we can compile a +# much smaller graph and then reuse the compiled graph for all the regions. +# In the example, ``torch.compile`` is applied only to the ``layers`` and not the full model. 
# regional_compiled_model = Model(apply_regional_compilation=True).cuda() # Applying compilation to a repeated region, instead of full model, leads to # large savings in compile time. Here, we will just compile a layer instance and -# then reuse it 64 times in the `model` object. +# then reuse it 64 times in the ``Model`` object. # # Note that with repeated regions, some part of the model might not be compiled. -# For example, the `self.linear` in the `Model` is outside of the scope of +# For example, the ``self.linear`` in the ``Model`` is outside of the scope of # regional compilation. # # Also, note that there is a tradeoff between performance speedup and compile -# time. The full model compilation has larger graph and therefore, -# theoretically, has more scope for optimizations. However for practical +# time. Full model compilation involves a larger graph and, +# theoretically, offers more scope for optimizations. However, for practical # purposes and depending on the model, we have observed many cases with minimal # speedup differences between the full model and regional compilation. +################################################### +# Next, let's measure the compilation time of the full model and the regional compilation. # -# 4. Measure the compilation time of the full model and the regional compilation. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# `torch.compile` is a JIT compiler, i.e., it compiles on the first invocation. -# Here, we measure the total time spent in the first invocation. This is not -# precise, but it gives a good idea because the majority of time is spent in +# ``torch.compile`` is a JIT compiler, which means that it compiles on the first invocation. +# In the code below, we measure the total time spent in the first invocation. While this method is not +# precise, it provides a good estimate since the majority of the time is spent in # compilation. def measure_latency(fn, input): @@ -150,11 +157,13 @@ def measure_latency(fn, input): print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds") ############################################################################ +# Conclusion +# ----------- +# # This recipe shows how to control the cold start compilation time if your model -# has repeated regions. This requires user changes to apply `torch.compile` to +# has repeated regions. This approach requires user modifications to apply `torch.compile` to # the repeated regions instead of more commonly used full model compilation. We -# are continually working on reducing cold start compilation time. So, please -# stay tuned for our next tutorials. +# are continually working on reducing cold start compilation time. # # This feature is available with 2.5 release. 
If you are on 2.4, you can use a # config flag - `torch._dynamo.config.inline_inbuilt_nn_modules=True` to avoid From 937ae75c29b66192e43e6bdbdb21c54e11bb26d6 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 1 Oct 2024 11:07:18 -0700 Subject: [PATCH 3/5] Move the file to recipes_source and change runner --- .jenkins/metadata.json | 3 +++ recipes_source/recipes_index.rst | 2 +- recipes_source/{recipes => }/regional_compilation.py | 0 3 files changed, 4 insertions(+), 1 deletion(-) rename recipes_source/{recipes => }/regional_compilation.py (100%) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 2f1a9933aab..28829868d45 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -58,6 +58,9 @@ "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, + "recipes_source/regional_compilation.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "advanced_source/semi_structured_sparse.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 5d24b328f73..7f118df7b24 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -345,7 +345,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :header: Reducing torch.compile cold start compilation time with regional compilation :card_description: Learn how to use regional compilation to control cold start compile time :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/recipes/regional_compilation.html + :link: ../recipes/regional_compilation.html :tags: Model-Optimization .. Intel(R) Extension for PyTorch* diff --git a/recipes_source/recipes/regional_compilation.py b/recipes_source/regional_compilation.py similarity index 100% rename from recipes_source/recipes/regional_compilation.py rename to recipes_source/regional_compilation.py From 05bee4b97f350110cb7fe047ddda455a9d8fe926 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 1 Oct 2024 11:16:34 -0700 Subject: [PATCH 4/5] Remove toctree line --- recipes_source/recipes_index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 7f118df7b24..7d6a067b7f3 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -461,7 +461,6 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/amp_recipe /recipes/recipes/tuning_guide /recipes/recipes/xeon_run_cpu - /recipes/recipes/regional_compilation /recipes/recipes/intel_extension_for_pytorch /recipes/compiling_optimizer /recipes/torch_compile_backend_ipex From 84f11aa75239a57b0b85af919b555dcab4bfcd5a Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 1 Oct 2024 11:26:41 -0700 Subject: [PATCH 5/5] Formatting cleanup --- recipes_source/regional_compilation.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/recipes_source/regional_compilation.py b/recipes_source/regional_compilation.py index 8641e8cdf24..f0f0e7f3e3d 100644 --- a/recipes_source/regional_compilation.py +++ b/recipes_source/regional_compilation.py @@ -13,6 +13,7 @@ ---------------- * Pytorch 2.5 or later + Setup ----- Before we begin, we need to install ``torch`` if it is not already @@ -116,6 +117,7 @@ def forward(self, x): regional_compiled_model = Model(apply_regional_compilation=True).cuda() +##################################################### # 
Applying compilation to a repeated region, instead of full model, leads to # large savings in compile time. Here, we will just compile a layer instance and # then reuse it 64 times in the ``Model`` object. @@ -165,7 +167,3 @@ def measure_latency(fn, input): # the repeated regions instead of more commonly used full model compilation. We # are continually working on reducing cold start compilation time. # -# This feature is available with 2.5 release. If you are on 2.4, you can use a -# config flag - `torch._dynamo.config.inline_inbuilt_nn_modules=True` to avoid -# recompilations on the regional compilation. In 2.5, this flag is turned on by -# default.
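
Beyond the cold start measurement above, the recipe notes a tradeoff between compile time and steady-state speedup. The sketch below is a supplementary example (not part of the patches) for measuring warm, post-compilation latency of the two variants; it assumes the ``full_compiled_model``, ``regional_compiled_model``, and ``input`` objects created in ``regional_compilation.py`` are in scope and that a CUDA device is available.

from time import perf_counter

import torch

# Assumes full_compiled_model, regional_compiled_model, and input are the objects
# defined in regional_compilation.py above.

def measure_warm_latency(fn, input, iters=100):
    # Warm-up calls so that compilation and autotuning are excluded from the measurement.
    for _ in range(3):
        fn(input)
    torch.cuda.synchronize()
    start = perf_counter()
    for _ in range(iters):
        fn(input)
    torch.cuda.synchronize()
    return (perf_counter() - start) / iters

full_warm = measure_warm_latency(full_compiled_model, input)
regional_warm = measure_warm_latency(regional_compiled_model, input)
print(f"Full model warm latency     = {full_warm * 1e6:.1f} us/iter")
print(f"Regional model warm latency = {regional_warm * 1e6:.1f} us/iter")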