From 215d81e48f5ea8ffd806e4ac426b310b6ef2ea07 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 30 Sep 2024 16:45:23 -0700 Subject: [PATCH 1/5] Regional compilation recipe --- recipes_source/recipes/README.txt | 4 + .../recipes/regional_compilation.py | 162 ++++++++++++++++++ recipes_source/recipes_index.rst | 10 ++ 3 files changed, 176 insertions(+) create mode 100644 recipes_source/recipes/regional_compilation.py diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index 18e4d7106b1..ef45bae081c 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -56,3 +56,7 @@ PyTorch Recipes 14. amp_recipe.py Automatic Mixed Precision https://pytorch.org/tutorials/recipes/amp_recipe.html + +15. regional_compilation.py + Reducing torch.compile cold start compilation time with regional compilation + https://pytorch.org/tutorials/recipes/regional_compilation.html diff --git a/recipes_source/recipes/regional_compilation.py b/recipes_source/recipes/regional_compilation.py new file mode 100644 index 00000000000..afb867e1763 --- /dev/null +++ b/recipes_source/recipes/regional_compilation.py @@ -0,0 +1,162 @@ +""" +Reducing torch.compile cold start compilation time with regional compilation +============================================================================ + +Introduction +------------ +As deep learning models get larger, the compilation time of these models also +increase. This increase in compilation time can lead to a large startup time in +inference services or wasted resources in large scale training. This recipe +shows an example of how to reduce the cold start compilation time by choosing to +compile a repeated region of the model instead of the entire model. + +Setup +----- +Before we begin, we need to install ``torch`` if it is not already +available. + +.. code-block:: sh + + pip install torch + +""" + + + +###################################################################### +# Steps +# ----- +# +# 1. Import all necessary libraries +# 2. Define and initialize a neural network with repeated regions. +# 3. Understand the difference between the full model and the regional compilation. +# 4. Measure the compilation time of the full model and the regional compilation. +# +# 1. Import necessary libraries for loading our data +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# + +import torch +import torch.nn as nn +from time import perf_counter + +# +# 2. Define and initialize a neural network with repeated regions. +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Typically neural networks are composed of repeated layers. For example, a +# large language model is composed of many Transformer blocks. In this recipe, +# we will create a `Layer` `nn.Module` class as a proxy for a repeated region. +# We will then create a `Model` which is composed of 64 instances of this +# `Layer` class. +# +class Layer(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear1 = torch.nn.Linear(10, 10) + self.relu1 = torch.nn.ReLU() + self.linear2 = torch.nn.Linear(10, 10) + self.relu2 = torch.nn.ReLU() + + def forward(self, x): + a = self.linear1(x) + a = self.relu1(a) + a = torch.sigmoid(a) + b = self.linear2(a) + b = self.relu2(b) + return b + +class Model(torch.nn.Module): + def __init__(self, apply_regional_compilation): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + # Apply compile only to the repeated layers. 
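+        # All 64 instances share the same Layer.forward code, so the compiled
+        # artifact produced for one instance is reused for the others and the
+        # repeated region is compiled only once rather than once per layer.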
+ if apply_regional_compilation: + self.layers = torch.nn.ModuleList([torch.compile(Layer()) for _ in range(64)]) + else: + self.layers = torch.nn.ModuleList([Layer() for _ in range(64)]) + + def forward(self, x): + # In regional compilation, the self.linear is outside of the scope of `torch.compile`. + x = self.linear(x) + for layer in self.layers: + x = layer(x) + return x + +# +# 3. Understand the difference between the full model and the regional compilation. +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In full model compilation, the full model is compiled as a whole. This is how +# most users use torch.compile. In this example, we can apply torch.compile to +# the `model` object. This will effectively inline the 64 layers, producing a +# large graph to compile. You can look at the full graph by running this recipe +# with `TORCH_LOGS=graph_code`. +# +# + +model = Model(apply_regional_compilation=False).cuda() +full_compiled_model = torch.compile(model) + + +# +# The regional compilation, on the other hand, compiles a region of the model. +# By wisely choosing to compile a repeated region of the model, we can compile a +# much smaller graph and then reuse the compiled graph for all the regions. We +# can apply regional compilation in the example as follows. `torch.compile` is +# applied only to the `layers` and not the full model. +# + +regional_compiled_model = Model(apply_regional_compilation=True).cuda() + +# Applying compilation to a repeated region, instead of full model, leads to +# large savings in compile time. Here, we will just compile a layer instance and +# then reuse it 64 times in the `model` object. +# +# Note that with repeated regions, some part of the model might not be compiled. +# For example, the `self.linear` in the `Model` is outside of the scope of +# regional compilation. +# +# Also, note that there is a tradeoff between performance speedup and compile +# time. The full model compilation has larger graph and therefore, +# theoretically, has more scope for optimizations. However for practical +# purposes and depending on the model, we have observed many cases with minimal +# speedup differences between the full model and regional compilation. + + +# +# 4. Measure the compilation time of the full model and the regional compilation. +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# `torch.compile` is a JIT compiler, i.e., it compiles on the first invocation. +# Here, we measure the total time spent in the first invocation. This is not +# precise, but it gives a good idea because the majority of time is spent in +# compilation. + +def measure_latency(fn, input): + # Reset the compiler caches to ensure no reuse between different runs + torch.compiler.reset() + with torch._inductor.utils.fresh_inductor_cache(): + start = perf_counter() + fn(input) + torch.cuda.synchronize() + end = perf_counter() + return end - start + +input = torch.randn(10, 10, device="cuda") +full_model_compilation_latency = measure_latency(full_compiled_model, input) +print(f"Full model compilation time = {full_model_compilation_latency:.2f} seconds") + +regional_compilation_latency = measure_latency(regional_compiled_model, input) +print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds") + +############################################################################ +# This recipe shows how to control the cold start compilation time if your model +# has repeated regions. 
This requires user changes to apply `torch.compile` to +# the repeated regions instead of more commonly used full model compilation. We +# are continually working on reducing cold start compilation time. So, please +# stay tuned for our next tutorials. +# +# This feature is available with 2.5 release. If you are on 2.4, you can use a +# config flag - `torch._dynamo.config.inline_inbuilt_nn_modules=True` to avoid +# recompilations on the regional compilation. In 2.5, this flag is turned on by +# default. diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index caccdcc28f7..5d24b328f73 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -339,6 +339,15 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/torch_compile_caching_tutorial.html :tags: Model-Optimization +.. Reducing Cold Start Compilation Time with Regional Compilation + +.. customcarditem:: + :header: Reducing torch.compile cold start compilation time with regional compilation + :card_description: Learn how to use regional compilation to control cold start compile time + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/recipes/regional_compilation.html + :tags: Model-Optimization + .. Intel(R) Extension for PyTorch* .. customcarditem:: @@ -452,6 +461,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/amp_recipe /recipes/recipes/tuning_guide /recipes/recipes/xeon_run_cpu + /recipes/recipes/regional_compilation /recipes/recipes/intel_extension_for_pytorch /recipes/compiling_optimizer /recipes/torch_compile_backend_ipex From 52fe9489e3068f870ff987bd4ecdb43c8e5148aa Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 1 Oct 2024 10:41:17 -0700 Subject: [PATCH 2/5] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- .../recipes/regional_compilation.py | 83 ++++++++++--------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/recipes_source/recipes/regional_compilation.py b/recipes_source/recipes/regional_compilation.py index afb867e1763..8641e8cdf24 100644 --- a/recipes_source/recipes/regional_compilation.py +++ b/recipes_source/recipes/regional_compilation.py @@ -2,14 +2,17 @@ Reducing torch.compile cold start compilation time with regional compilation ============================================================================ -Introduction ------------- +**Author:** `Animesh Jain `_ As deep learning models get larger, the compilation time of these models also -increase. This increase in compilation time can lead to a large startup time in -inference services or wasted resources in large scale training. This recipe +increases. This extended compilation time can result in a large startup time in +inference services or wasted resources in large-scale training. This recipe shows an example of how to reduce the cold start compilation time by choosing to compile a repeated region of the model instead of the entire model. +Prerequisites +---------------- + +* Pytorch 2.5 or later Setup ----- Before we begin, we need to install ``torch`` if it is not already @@ -19,6 +22,10 @@ pip install torch +.. note:: + This feature is available starting with the 2.5 release. If you are using version 2.4, + you can enable the configuration flag ``torch._dynamo.config.inline_inbuilt_nn_modules=True`` + to prevent recompilations during regional compilation. In version 2.5, this flag is enabled by default. 
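+
+    For example, on version 2.4 the flag can be enabled before compiling the repeated regions:
+
+    .. code-block:: python
+
+        import torch._dynamo
+
+        # Reuse the compiled region across identical layer instances on 2.4.
+        torch._dynamo.config.inline_inbuilt_nn_modules = True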
""" @@ -27,13 +34,15 @@ # Steps # ----- # -# 1. Import all necessary libraries +# In this recipe, we will follow these steps: +# +# 1. Import all necessary libraries. # 2. Define and initialize a neural network with repeated regions. # 3. Understand the difference between the full model and the regional compilation. # 4. Measure the compilation time of the full model and the regional compilation. # -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# First, let's import the necessary libraries for loading our data: +# # # @@ -41,14 +50,14 @@ import torch.nn as nn from time import perf_counter +########################################################## +# Next, let's define and initialize a neural network with repeated regions. # -# 2. Define and initialize a neural network with repeated regions. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Typically neural networks are composed of repeated layers. For example, a +# Typically, neural networks are composed of repeated layers. For example, a # large language model is composed of many Transformer blocks. In this recipe, -# we will create a `Layer` `nn.Module` class as a proxy for a repeated region. -# We will then create a `Model` which is composed of 64 instances of this -# `Layer` class. +# we will create a ``Layer`` using the ``nn.Module`` class as a proxy for a repeated region. +# We will then create a ``Model`` which is composed of 64 instances of this +# ``Layer`` class. # class Layer(torch.nn.Module): def __init__(self): @@ -83,15 +92,14 @@ def forward(self, x): x = layer(x) return x -# -# 3. Understand the difference between the full model and the regional compilation. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +#################################################### +# Next, let's review the difference between the full model and the regional compilation. # -# In full model compilation, the full model is compiled as a whole. This is how -# most users use torch.compile. In this example, we can apply torch.compile to -# the `model` object. This will effectively inline the 64 layers, producing a +# In full model compilation, the entire model is compiled as a whole. This is the common approach +# most users take with ``torch.compile``. In this example, we apply ``torch.compile`` to +# the ``Model`` object. This will effectively inline the 64 layers, producing a # large graph to compile. You can look at the full graph by running this recipe -# with `TORCH_LOGS=graph_code`. +# with ``TORCH_LOGS=graph_code``. # # @@ -99,37 +107,36 @@ def forward(self, x): full_compiled_model = torch.compile(model) -# +################################################### # The regional compilation, on the other hand, compiles a region of the model. -# By wisely choosing to compile a repeated region of the model, we can compile a -# much smaller graph and then reuse the compiled graph for all the regions. We -# can apply regional compilation in the example as follows. `torch.compile` is -# applied only to the `layers` and not the full model. +# By strategically choosing to compile a repeated region of the model, we can compile a +# much smaller graph and then reuse the compiled graph for all the regions. +# In the example, ``torch.compile`` is applied only to the ``layers`` and not the full model. 
# regional_compiled_model = Model(apply_regional_compilation=True).cuda() # Applying compilation to a repeated region, instead of full model, leads to # large savings in compile time. Here, we will just compile a layer instance and -# then reuse it 64 times in the `model` object. +# then reuse it 64 times in the ``Model`` object. # # Note that with repeated regions, some part of the model might not be compiled. -# For example, the `self.linear` in the `Model` is outside of the scope of +# For example, the ``self.linear`` in the ``Model`` is outside of the scope of # regional compilation. # # Also, note that there is a tradeoff between performance speedup and compile -# time. The full model compilation has larger graph and therefore, -# theoretically, has more scope for optimizations. However for practical +# time. Full model compilation involves a larger graph and, +# theoretically, offers more scope for optimizations. However, for practical # purposes and depending on the model, we have observed many cases with minimal # speedup differences between the full model and regional compilation. +################################################### +# Next, let's measure the compilation time of the full model and the regional compilation. # -# 4. Measure the compilation time of the full model and the regional compilation. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# `torch.compile` is a JIT compiler, i.e., it compiles on the first invocation. -# Here, we measure the total time spent in the first invocation. This is not -# precise, but it gives a good idea because the majority of time is spent in +# ``torch.compile`` is a JIT compiler, which means that it compiles on the first invocation. +# In the code below, we measure the total time spent in the first invocation. While this method is not +# precise, it provides a good estimate since the majority of the time is spent in # compilation. def measure_latency(fn, input): @@ -150,11 +157,13 @@ def measure_latency(fn, input): print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds") ############################################################################ +# Conclusion +# ----------- +# # This recipe shows how to control the cold start compilation time if your model -# has repeated regions. This requires user changes to apply `torch.compile` to +# has repeated regions. This approach requires user modifications to apply `torch.compile` to # the repeated regions instead of more commonly used full model compilation. We -# are continually working on reducing cold start compilation time. So, please -# stay tuned for our next tutorials. +# are continually working on reducing cold start compilation time. # # This feature is available with 2.5 release. 
If you are on 2.4, you can use a # config flag - `torch._dynamo.config.inline_inbuilt_nn_modules=True` to avoid From 937ae75c29b66192e43e6bdbdb21c54e11bb26d6 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 1 Oct 2024 11:07:18 -0700 Subject: [PATCH 3/5] Move the file to recipes_source and change runner --- .jenkins/metadata.json | 3 +++ recipes_source/recipes_index.rst | 2 +- recipes_source/{recipes => }/regional_compilation.py | 0 3 files changed, 4 insertions(+), 1 deletion(-) rename recipes_source/{recipes => }/regional_compilation.py (100%) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 2f1a9933aab..28829868d45 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -58,6 +58,9 @@ "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, + "recipes_source/regional_compilation.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "advanced_source/semi_structured_sparse.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 5d24b328f73..7f118df7b24 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -345,7 +345,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :header: Reducing torch.compile cold start compilation time with regional compilation :card_description: Learn how to use regional compilation to control cold start compile time :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: ../recipes/recipes/regional_compilation.html + :link: ../recipes/regional_compilation.html :tags: Model-Optimization .. Intel(R) Extension for PyTorch* diff --git a/recipes_source/recipes/regional_compilation.py b/recipes_source/regional_compilation.py similarity index 100% rename from recipes_source/recipes/regional_compilation.py rename to recipes_source/regional_compilation.py From 05bee4b97f350110cb7fe047ddda455a9d8fe926 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 1 Oct 2024 11:16:34 -0700 Subject: [PATCH 4/5] Remove toctree line --- recipes_source/recipes_index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 7f118df7b24..7d6a067b7f3 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -461,7 +461,6 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/amp_recipe /recipes/recipes/tuning_guide /recipes/recipes/xeon_run_cpu - /recipes/recipes/regional_compilation /recipes/recipes/intel_extension_for_pytorch /recipes/compiling_optimizer /recipes/torch_compile_backend_ipex From 84f11aa75239a57b0b85af919b555dcab4bfcd5a Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 1 Oct 2024 11:26:41 -0700 Subject: [PATCH 5/5] Formatting cleanup --- recipes_source/regional_compilation.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/recipes_source/regional_compilation.py b/recipes_source/regional_compilation.py index 8641e8cdf24..f0f0e7f3e3d 100644 --- a/recipes_source/regional_compilation.py +++ b/recipes_source/regional_compilation.py @@ -13,6 +13,7 @@ ---------------- * Pytorch 2.5 or later + Setup ----- Before we begin, we need to install ``torch`` if it is not already @@ -116,6 +117,7 @@ def forward(self, x): regional_compiled_model = Model(apply_regional_compilation=True).cuda() +##################################################### # 
Applying compilation to a repeated region, instead of full model, leads to # large savings in compile time. Here, we will just compile a layer instance and # then reuse it 64 times in the ``Model`` object. @@ -165,7 +167,3 @@ def measure_latency(fn, input): # the repeated regions instead of more commonly used full model compilation. We # are continually working on reducing cold start compilation time. # -# This feature is available with 2.5 release. If you are on 2.4, you can use a -# config flag - `torch._dynamo.config.inline_inbuilt_nn_modules=True` to avoid -# recompilations on the regional compilation. In 2.5, this flag is turned on by -# default.
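
Beyond the cold start measurement above, the recipe notes a tradeoff between compile time and steady-state speedup. The sketch below is a supplementary example (not part of the patches) for measuring warm, post-compilation latency of the two variants; it assumes the ``full_compiled_model``, ``regional_compiled_model``, and ``input`` objects created in ``regional_compilation.py`` are in scope and that a CUDA device is available.

from time import perf_counter

import torch

# Assumes full_compiled_model, regional_compiled_model, and input are the objects
# defined in regional_compilation.py above.

def measure_warm_latency(fn, input, iters=100):
    # Warm-up calls so that compilation and autotuning are excluded from the measurement.
    for _ in range(3):
        fn(input)
    torch.cuda.synchronize()
    start = perf_counter()
    for _ in range(iters):
        fn(input)
    torch.cuda.synchronize()
    return (perf_counter() - start) / iters

full_warm = measure_warm_latency(full_compiled_model, input)
regional_warm = measure_warm_latency(regional_compiled_model, input)
print(f"Full model warm latency     = {full_warm * 1e6:.1f} us/iter")
print(f"Regional model warm latency = {regional_warm * 1e6:.1f} us/iter")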