From 1dea27833a1b2d0144cde38b9b0b0e8c3aaafef6 Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 12 Aug 2024 23:32:22 +0000 Subject: [PATCH 01/21] Tutorial for AOTI Python runtime --- .../torch_export_aoti_python.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 intermediate_source/torch_export_aoti_python.py diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py new file mode 100644 index 00000000000..48168dcef73 --- /dev/null +++ b/intermediate_source/torch_export_aoti_python.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +""" +torch.export AOT Inductor Tutorial for Python runtime +=================================================== +**Author:** Ankith Gunapal +""" + +###################################################################### +# +# .. warning:: +# +# ``torch._export.aot_compile`` and ``torch._export.aot_load`` are in Beta status and are subject to backwards compatibility +# breaking changes. This tutorial provides an example of how to use these APIs for model deployment using Python runtime. +# +# It has been shown `previously `__ how AOTInductor can be used +# to do Ahead-of-Time compilation of PyTorch exported models by creating +# a shared library that can be run in a non-Python environment. +# +# +# In this tutorial, you will learn an end-to-end example of how to use AOTInductor for python runtime. +# We will look at how to use :func:`torch._export.aot_compile` to generate a shared library. +# We also look at how we can run the shared library in python runtime using :func:`torch._export.aot_load`. +# +# **Contents** +# +# .. contents:: +# :local: + + +###################################################################### +# Model Compilation +# ------------ +# +# We will use TorchVision's pretrained `ResNet18` model in this example and use TorchInductor on the +# exported PyTorch program using :func:`torch._export.aot_compile` +# +# .. note:: +# +# This API also supports :func:`torch.compile` options like `mode` +# As an example, if used on a CUDA enabled device, we can set `"max_autotune": True` +# +# We also specify `dynamic_shapes` for the batch dimension. 
In this example, min=2 is not a bug and is +# explained in `The 0/1 Specialization Problem `__ + + +import os +import torch +from torchvision.models import ResNet18_Weights, resnet18 + +model = resnet18(weights=ResNet18_Weights.DEFAULT) +model.eval() + +with torch.inference_mode(): + + # Specify the generated shared library path + aot_compile_options = { + "aot_inductor.output_path": os.path.join(os.getcwd(), "resnet18_pt2.so"), + } + if torch.cuda.is_available(): + device = "cuda" + aot_compile_options.update({"max_autotune": True}) + else: + device = "cpu" + # We need to turn off the below optimizations to support batch_size = 16, + # which is treated like a special case + # https://github.com/pytorch/pytorch/pull/116152 + torch.backends.mkldnn.set_flags(False) + torch.backends.nnpack.set_flags(False) + + model = model.to(device=device) + example_inputs = (torch.randn(2, 3, 224, 224, device=device),) + + # min=2 is not a bug and is explained in the 0/1 Specialization Problem + batch_dim = torch.export.Dim("batch", min=2, max=32) + so_path = torch._export.aot_compile( + model, + example_inputs, + # Specify the first dimension of the input x as dynamic + dynamic_shapes={"x": {0: batch_dim}}, + # Specify the generated shared library path + options=aot_compile_options + ) + + +###################################################################### +# Model Inference in Python +# ------------ +# +# Typically the shared object generated above is used in a non-Python environment. In PyTorch 2.3, +# we added a new API :func:`torch._export.aot_load` to load the shared library in python runtime. +# The API follows a similar structure to the :func:`torch.jit.load` API . We specify the path +# of the shared library and the device where this should be loaded. +# .. note:: +# +# We specify batch_size=1 for inference and it works even though we specified min=2 in +# :func:`torch._export.aot_compile` + + +import os +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" +model_so_path = os.path.join(os.getcwd(), "resnet18_pt2.so") + +model = torch._export.aot_load(model_so_path, device) +example_inputs = (torch.randn(1, 3, 224, 224, device=device),) + +with torch.inference_mode(): + output = model(example_inputs) \ No newline at end of file From cd09129c21873b56518492d58caebd16bfebcb53 Mon Sep 17 00:00:00 2001 From: Ankith Gunapal Date: Tue, 13 Aug 2024 10:40:12 -0700 Subject: [PATCH 02/21] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- .../torch_export_aoti_python.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 48168dcef73..9df68bae5fb 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -torch.export AOT Inductor Tutorial for Python runtime +(Beta) ``torch.export`` AOT Inductor Tutorial for Python runtime =================================================== **Author:** Ankith Gunapal """ @@ -20,7 +20,7 @@ # # In this tutorial, you will learn an end-to-end example of how to use AOTInductor for python runtime. # We will look at how to use :func:`torch._export.aot_compile` to generate a shared library. -# We also look at how we can run the shared library in python runtime using :func:`torch._export.aot_load`. 
+# Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. # # **Contents** # @@ -32,15 +32,15 @@ # Model Compilation # ------------ # -# We will use TorchVision's pretrained `ResNet18` model in this example and use TorchInductor on the -# exported PyTorch program using :func:`torch._export.aot_compile` +# We will use TorchVision's pretrained `ResNet18` model and TorchInductor on the +# exported PyTorch program using :func:`torch._export.aot_compile`. # # .. note:: # # This API also supports :func:`torch.compile` options like `mode` # As an example, if used on a CUDA enabled device, we can set `"max_autotune": True` # -# We also specify `dynamic_shapes` for the batch dimension. In this example, min=2 is not a bug and is +# We also specify ``dynamic_shapes`` for the batch dimension. In this example, ``min=2`` is not a bug and is # explained in `The 0/1 Specialization Problem `__ @@ -87,14 +87,14 @@ # Model Inference in Python # ------------ # -# Typically the shared object generated above is used in a non-Python environment. In PyTorch 2.3, -# we added a new API :func:`torch._export.aot_load` to load the shared library in python runtime. +# Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3, +# we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. # The API follows a similar structure to the :func:`torch.jit.load` API . We specify the path -# of the shared library and the device where this should be loaded. +# of the shared library and the device where it should be loaded. # .. note:: # -# We specify batch_size=1 for inference and it works even though we specified min=2 in -# :func:`torch._export.aot_compile` +# In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in +# :func:`torch._export.aot_compile`. import os From 3fa9b208c593a0683893f12c3d49bc1a53d99f11 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 13 Aug 2024 20:10:53 +0000 Subject: [PATCH 03/21] Addressed review comments and added a section on why AOTI Python --- .jenkins/metadata.json | 3 + .../torch_export_aoti_python.py | 116 +++++++++++++++++- 2 files changed, 115 insertions(+), 4 deletions(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 4814f9a7d2b..5c6af7b80ff 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -28,6 +28,9 @@ "intermediate_source/model_parallel_tutorial.py": { "needs": "linux.16xlarge.nvidia.gpu" }, + "intermediate_source/torch_export_aoti_python.py": { + "needs": "linux.16xlarge.nvidia.gpu" + }, "advanced_source/pendulum.py": { "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 9df68bae5fb..8279977c767 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -27,6 +27,19 @@ # .. 
contents:: # :local: +###################################################################### +# Prerequisites +# ------------- +# * PyTorch 2.4 or later +# * Basic understanding of ``torch._export`` and AOT Inductor +# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial + +###################################################################### +# What you will learn +# ---------------------- +# * How to use AOTInductor for python runtime. +# * How to use :func:`torch._export.aot_compile` to generate a shared library +# * How to run a shared library in Python runtime using :func:`torch._export.aot_load`. ###################################################################### # Model Compilation @@ -37,8 +50,9 @@ # # .. note:: # -# This API also supports :func:`torch.compile` options like `mode` -# As an example, if used on a CUDA enabled device, we can set `"max_autotune": True` +# This API also supports :func:`torch.compile` options like ``mode`` +# This means that if used on a CUDA enabled device, you can, for example, set ``"max_autotune": True`` +# which leverages Triton based matrix multiplications & convolutions, and enables CUDA graphs by default. # # We also specify ``dynamic_shapes`` for the batch dimension. In this example, ``min=2`` is not a bug and is # explained in `The 0/1 Specialization Problem `__ @@ -89,7 +103,7 @@ # # Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3, # we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. -# The API follows a similar structure to the :func:`torch.jit.load` API . We specify the path +# The API follows a structure similar to the :func:`torch.jit.load` API . You need to specify the path # of the shared library and the device where it should be loaded. # .. note:: # @@ -107,4 +121,98 @@ example_inputs = (torch.randn(1, 3, 224, 224, device=device),) with torch.inference_mode(): - output = model(example_inputs) \ No newline at end of file + output = model(example_inputs) + +###################################################################### +# When to use AOT Inductor Python Runtime +# --------------------------------------- +# +# One of the requirements for using AOT Inductor is that the model shouldn't have any graph breaks. +# Once this requirement is met, the primary use case for using AOT Inductor Python Runtime is for +# model deployment using Python. +# There are mainly two reasons why you would use AOT Inductor Python Runtime: +# +# - ``torch._export.aot_compile`` generates a shared library. This is useful for model +# versioning for deployments and tracking model performance over time. +# - With :func:`torch.compile` being a JIT compiler, there is a warmup +# cost associated with the first compilation. Your deployment needs to account for the +# compilation time taken for the first inference. With AOT Inductor, the compilation is +# done offline using ``torch._export.aot_compile``. The deployment would only load the +# shared library using ``torch._export.aot_load`` and run inference. +# +# +# The section below shows the speedup achieved with AOT Inductor for first inference +# +# We define a utility function ``timed`` to measure the time taken for inference +# + +import time +def timed(fn): + # Returns the result of running `fn()` and the time it took for `fn()` to run, + # in seconds. We use CUDA events and synchronization for accurate + # measurement on CUDA enabled devices. 
+    if torch.cuda.is_available():
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        start.record()
+    else:
+        start = time.time()
+
+    result = fn()
+    if torch.cuda.is_available():
+        end.record()
+        torch.cuda.synchronize()
+    else:
+        end = time.time()
+
+    # Measure time taken to execute the function in milliseconds
+    if torch.cuda.is_available():
+        duration = start.elapsed_time(end)
+    else:
+        duration = (end - start) * 1000
+
+    return result, duration
+
+
+######################################################################
+# Lets measure the time for first inference using AOT Inductor
+
+torch._dynamo.reset()
+
+model = torch._export.aot_load(model_so_path, device)
+example_inputs = (torch.randn(1, 3, 224, 224, device=device),)
+
+with torch.inference_mode():
+    _, time_taken = timed(lambda: model(example_inputs))
+    print(f"Time taken for first inference for AOT Inductor is {time_taken:.2f} ms")
+
+
+######################################################################
+# Lets measure the time for first inference using ``torch.compile``
+
+torch._dynamo.reset()
+
+model = resnet18(weights=ResNet18_Weights.DEFAULT).to(device)
+model.eval()
+
+model = torch.compile(model)
+example_inputs = torch.randn(1, 3, 224, 224, device=device)
+
+with torch.inference_mode():
+    _, time_taken = timed(lambda: model(example_inputs))
+    print(f"Time taken for first inference for torch.compile is {time_taken:.2f} ms")
+
+######################################################################
+# We see that there is a drastic speedup in first inference time using AOT Inductor compared
+# to ``torch.compile``
+
+######################################################################
+# Conclusion
+# ----------
+#
+# In this tutorial, we have learned how to effectively use the AOTInductor for Python runtime by
+# compiling and loading a pretrained ``ResNet18`` model using the ``torch._export.aot_compile``
+# and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of
+# generating a shared library and running it within a Python environment, even with dynamic shape
+# considerations and device-specific optimizations. We also looked at the advantage of using
+# AOT Inductor in model deployments, with regards to speed up in first inference time. 
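The first-inference comparison added in this patch isolates the compilation cost that ``torch.compile`` pays at run time. As a complementary illustration (a minimal sketch, not taken from the patches themselves; it assumes the same TorchVision ``ResNet18`` weights and runs on either CPU or a CUDA device), timing a few consecutive calls to the ``torch.compile`` model shows that the cost is paid only on the first call, which is exactly the warmup that AOTInductor moves offline:

import time

import torch
from torchvision.models import ResNet18_Weights, resnet18

device = "cuda" if torch.cuda.is_available() else "cpu"

model = resnet18(weights=ResNet18_Weights.DEFAULT).to(device).eval()
compiled_model = torch.compile(model)
x = torch.randn(1, 3, 224, 224, device=device)

with torch.inference_mode():
    for i in range(3):
        # Synchronize around each call so the wall-clock time reflects the full GPU work.
        if device == "cuda":
            torch.cuda.synchronize()
        start = time.time()
        compiled_model(x)
        if device == "cuda":
            torch.cuda.synchronize()
        print(f"torch.compile call {i}: {(time.time() - start) * 1000:.2f} ms")

The first call includes TorchDynamo tracing and TorchInductor code generation; later calls reuse the compiled graph, so only the first inference carries the overhead measured in the patch above.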
From 7c9edb7991ee8b3f37fd914f4e30094b8815c4e5 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 13 Aug 2024 20:13:55 +0000 Subject: [PATCH 04/21] Addressed review comments and added a section on why AOTI Python --- .../torch_export_aoti_python.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 8279977c767..9ac4e344919 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -(Beta) ``torch.export`` AOT Inductor Tutorial for Python runtime +(Beta) ``torch.export`` AOTInductor Tutorial for Python runtime =================================================== **Author:** Ankith Gunapal """ @@ -31,7 +31,7 @@ # Prerequisites # ------------- # * PyTorch 2.4 or later -# * Basic understanding of ``torch._export`` and AOT Inductor +# * Basic understanding of ``torch._export`` and AOTInductor # * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial ###################################################################### @@ -40,6 +40,7 @@ # * How to use AOTInductor for python runtime. # * How to use :func:`torch._export.aot_compile` to generate a shared library # * How to run a shared library in Python runtime using :func:`torch._export.aot_load`. +# * When do you use AOTInductor for python runtime ###################################################################### # Model Compilation @@ -124,24 +125,24 @@ output = model(example_inputs) ###################################################################### -# When to use AOT Inductor Python Runtime +# When to use AOTInductor for Python Runtime # --------------------------------------- # -# One of the requirements for using AOT Inductor is that the model shouldn't have any graph breaks. -# Once this requirement is met, the primary use case for using AOT Inductor Python Runtime is for +# One of the requirements for using AOTInductor is that the model shouldn't have any graph breaks. +# Once this requirement is met, the primary use case for using AOTInductor Python Runtime is for # model deployment using Python. -# There are mainly two reasons why you would use AOT Inductor Python Runtime: +# There are mainly two reasons why you would use AOTInductor Python Runtime: # # - ``torch._export.aot_compile`` generates a shared library. This is useful for model # versioning for deployments and tracking model performance over time. # - With :func:`torch.compile` being a JIT compiler, there is a warmup # cost associated with the first compilation. Your deployment needs to account for the -# compilation time taken for the first inference. With AOT Inductor, the compilation is +# compilation time taken for the first inference. With AOTInductor, the compilation is # done offline using ``torch._export.aot_compile``. The deployment would only load the # shared library using ``torch._export.aot_load`` and run inference. 
# # -# The section below shows the speedup achieved with AOT Inductor for first inference +# The section below shows the speedup achieved with AOTInductor for first inference # # We define a utility function ``timed`` to measure the time taken for inference # @@ -175,7 +176,7 @@ def timed(fn): ###################################################################### -# Lets measure the time for first inference using AOT Inductor +# Lets measure the time for first inference using AOTInductor torch._dynamo.reset() @@ -184,7 +185,7 @@ def timed(fn): with torch.inference_mode(): _, time_taken = timed(lambda: model(example_inputs)) - print(f"Time taken for first inference for AOT Inductor is {time_taken:.2f} ms") + print(f"Time taken for first inference for AOTInductor is {time_taken:.2f} ms") ###################################################################### @@ -203,7 +204,7 @@ def timed(fn): print(f"Time taken for first inference for torch.compile is {time_taken:.2f} ms") ###################################################################### -# We see that there is a drastic speedup in first inference time using AOT Inductor compared +# We see that there is a drastic speedup in first inference time using AOTInductor compared # to ``torch.compile`` ###################################################################### @@ -215,4 +216,4 @@ def timed(fn): # and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of # generating a shared library and running it within a Python environment, even with dynamic shape # considerations and device-specific optimizations. We also looked at the advantage of using -# AOT Inductor in model deployments, with regards to speed up in first inference time. +# AOTInductor in model deployments, with regards to speed up in first inference time. From 9cba6fbebc20b52e85bc48ed0aeabe0738e08c9f Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 13 Aug 2024 21:37:43 +0000 Subject: [PATCH 05/21] fixed spelling --- en-wordlist.txt | 3 ++- intermediate_source/torch_export_aoti_python.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/en-wordlist.txt b/en-wordlist.txt index 62762ab69cc..e69cbaa1a5f 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -2,6 +2,7 @@ ACL ADI AOT +AOTInductor APIs ATen AVX @@ -617,4 +618,4 @@ warmstarting warmup webp wsi -wsis \ No newline at end of file +wsis diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 9ac4e344919..5102ce4acdd 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -46,7 +46,7 @@ # Model Compilation # ------------ # -# We will use TorchVision's pretrained `ResNet18` model and TorchInductor on the +# We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the # exported PyTorch program using :func:`torch._export.aot_compile`. # # .. 
note:: From a6f6cd991614c9da45ff9e9c13301a6f44e60338 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 13 Aug 2024 21:42:35 +0000 Subject: [PATCH 06/21] fixed spelling --- intermediate_source/torch_export_aoti_python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 5102ce4acdd..aa1b85bb1a2 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -3,7 +3,7 @@ """ (Beta) ``torch.export`` AOTInductor Tutorial for Python runtime =================================================== -**Author:** Ankith Gunapal +**Author:** Ankith Gunapal, Bin Bao """ ###################################################################### From 1375373b837aa307cb12777c2b0df42a0e7c576e Mon Sep 17 00:00:00 2001 From: Ankith Gunapal Date: Fri, 16 Aug 2024 14:24:14 -0700 Subject: [PATCH 07/21] Apply suggestions from code review Co-authored-by: Angela Yi Co-authored-by: Svetlana Karslioglu --- intermediate_source/torch_export_aoti_python.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index aa1b85bb1a2..a5e482b7361 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -88,11 +88,15 @@ # min=2 is not a bug and is explained in the 0/1 Specialization Problem batch_dim = torch.export.Dim("batch", min=2, max=32) - so_path = torch._export.aot_compile( + exported_program = torch.export.export( model, example_inputs, # Specify the first dimension of the input x as dynamic dynamic_shapes={"x": {0: batch_dim}}, + ) + so_path = torch._inductor.aot_compile( + exported_program.module(), + example_inputs, # Specify the generated shared library path options=aot_compile_options ) @@ -211,7 +215,7 @@ def timed(fn): # Conclusion # ---------- # -# In this tutorial, we have learned how to effectively use the AOTInductor for Python runtime by +# In this recipe, we have learned how to effectively use the AOTInductor for Python runtime by # compiling and loading a pretrained ``ResNet18`` model using the ``torch._export.aot_compile`` # and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of # generating a shared library and running it within a Python environment, even with dynamic shape From 71589859be8bfd3405e5427785caa8d1ae620fb8 Mon Sep 17 00:00:00 2001 From: agunapal Date: Fri, 16 Aug 2024 23:03:18 +0000 Subject: [PATCH 08/21] Addressed review comment --- intermediate_source/torch_export_aoti_python.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index a5e482b7361..321892688b1 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -21,6 +21,8 @@ # In this tutorial, you will learn an end-to-end example of how to use AOTInductor for python runtime. # We will look at how to use :func:`torch._export.aot_compile` to generate a shared library. # Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. +# You will learn about the speed up seen in the first inference time using AOTInductor, especially when using +# ``max-autotune`` mode which can take some time to execute. 
# # **Contents** # From 53f59654ac357f3bb39e825c895d014bd60ddd23 Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 19 Aug 2024 17:33:02 +0000 Subject: [PATCH 09/21] Changing to use g5.4xlarge machine --- .jenkins/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 5c6af7b80ff..155965abf1b 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -29,7 +29,7 @@ "needs": "linux.16xlarge.nvidia.gpu" }, "intermediate_source/torch_export_aoti_python.py": { - "needs": "linux.16xlarge.nvidia.gpu" + "needs": "linux.g5.4xlarge.nvidia.gpu" }, "advanced_source/pendulum.py": { "needs": "linux.g5.4xlarge.nvidia.gpu", From 4aa8399afff4999b078072729f173031bf48d652 Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 19 Aug 2024 19:59:35 +0000 Subject: [PATCH 10/21] Moved tutorial to recipe --- .../torch_export_aoti_python.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) rename {intermediate_source => recipes_source}/torch_export_aoti_python.py (88%) diff --git a/intermediate_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py similarity index 88% rename from intermediate_source/torch_export_aoti_python.py rename to recipes_source/torch_export_aoti_python.py index 321892688b1..46179531d04 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -3,14 +3,14 @@ """ (Beta) ``torch.export`` AOTInductor Tutorial for Python runtime =================================================== -**Author:** Ankith Gunapal, Bin Bao +**Author:** Ankith Gunapal, Bin Bao, Angela Yi """ ###################################################################### # # .. warning:: # -# ``torch._export.aot_compile`` and ``torch._export.aot_load`` are in Beta status and are subject to backwards compatibility +# ``torch._inductor.aot_compile`` and ``torch._export.aot_load`` are in Beta status and are subject to backwards compatibility # breaking changes. This tutorial provides an example of how to use these APIs for model deployment using Python runtime. # # It has been shown `previously `__ how AOTInductor can be used @@ -19,8 +19,8 @@ # # # In this tutorial, you will learn an end-to-end example of how to use AOTInductor for python runtime. -# We will look at how to use :func:`torch._export.aot_compile` to generate a shared library. -# Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. +# We will look at how to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a +# shared library. Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. # You will learn about the speed up seen in the first inference time using AOTInductor, especially when using # ``max-autotune`` mode which can take some time to execute. # @@ -33,14 +33,14 @@ # Prerequisites # ------------- # * PyTorch 2.4 or later -# * Basic understanding of ``torch._export`` and AOTInductor +# * Basic understanding of ``torch.export`` and AOTInductor # * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial ###################################################################### # What you will learn # ---------------------- # * How to use AOTInductor for python runtime. 
-# * How to use :func:`torch._export.aot_compile` to generate a shared library
+# * How to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a shared library
# * How to run a shared library in Python runtime using :func:`torch._export.aot_load`.
# * When do you use AOTInductor for python runtime
@@ -49,7 +49,7 @@
# ------------
#
# We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the
-# exported PyTorch program using :func:`torch._export.aot_compile`.
+# exported PyTorch program using :func:`torch._inductor.aot_compile`.
#
# .. note::
#
@@ -115,7 +115,7 @@
# .. note::
#
# In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in
-# :func:`torch._export.aot_compile`.
+# :func:`torch.export.export`.

import os
@@ -139,13 +139,13 @@
# model deployment using Python.
# There are mainly two reasons why you would use AOTInductor Python Runtime:
#
-# - ``torch._export.aot_compile`` generates a shared library. This is useful for model
+# - ``torch._inductor.aot_compile`` generates a shared library. This is useful for model
# versioning for deployments and tracking model performance over time.
# - With :func:`torch.compile` being a JIT compiler, there is a warmup
# cost associated with the first compilation. Your deployment needs to account for the
# compilation time taken for the first inference. With AOTInductor, the compilation is
-# done offline using ``torch._export.aot_compile``. The deployment would only load the
-# shared library using ``torch._export.aot_load`` and run inference.
+# done offline using ``torch.export.export`` & ``torch._inductor.aot_compile``. The deployment
+# would only load the shared library using ``torch._export.aot_load`` and run inference.
#
#
# The section below shows the speedup achieved with AOTInductor for first inference
#
# We define a utility function ``timed`` to measure the time taken for inference
#
@@ -218,7 +218,7 @@ def timed(fn):
# ----------
#
# In this recipe, we have learned how to effectively use the AOTInductor for Python runtime by
-# compiling and loading a pretrained ``ResNet18`` model using the ``torch._export.aot_compile``
+# compiling and loading a pretrained ``ResNet18`` model using the ``torch._inductor.aot_compile``
# and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of
# generating a shared library and running it within a Python environment, even with dynamic shape
# considerations and device-specific optimizations. 
We also looked at the advantage of using From 35c5dc892cc9725b3b1fcfb8f8f16d79247bb1fd Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 19 Aug 2024 20:03:39 +0000 Subject: [PATCH 11/21] addressed review comments --- recipes_source/torch_export_aoti_python.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 46179531d04..702d9d67f95 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -79,11 +79,6 @@ aot_compile_options.update({"max_autotune": True}) else: device = "cpu" - # We need to turn off the below optimizations to support batch_size = 16, - # which is treated like a special case - # https://github.com/pytorch/pytorch/pull/116152 - torch.backends.mkldnn.set_flags(False) - torch.backends.nnpack.set_flags(False) model = model.to(device=device) example_inputs = (torch.randn(2, 3, 224, 224, device=device),) From 71acd963ca898af95674a5c9ef7f0ecd2f5aa248 Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 19 Aug 2024 20:48:31 +0000 Subject: [PATCH 12/21] Moved tutorial to recipe --- .jenkins/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 155965abf1b..2f1a9933aab 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -28,7 +28,7 @@ "intermediate_source/model_parallel_tutorial.py": { "needs": "linux.16xlarge.nvidia.gpu" }, - "intermediate_source/torch_export_aoti_python.py": { + "recipes_source/torch_export_aoti_python.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, "advanced_source/pendulum.py": { From 7f5fde997030eb1767319dd36f2cdb0099d1bd1b Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 00:00:31 +0000 Subject: [PATCH 13/21] Change base image to nvidia devel image --- .ci/docker/build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 31f42fdbd85..ad1a7395baf 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -11,8 +11,9 @@ IMAGE_NAME="$1" shift export UBUNTU_VERSION="20.04" +export CUDA_VERSION="12.1.1" -export BASE_IMAGE="ubuntu:${UBUNTU_VERSION}" +export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" echo "Building ${IMAGE_NAME} Docker image" docker build \ From 790f7625f0dd28ee2fcb7271405860bb8ba76f65 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 01:12:10 +0000 Subject: [PATCH 14/21] Change base image to nvidia devel image --- .ci/docker/common/common_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh index b20286a4099..c7eabda555d 100644 --- a/.ci/docker/common/common_utils.sh +++ b/.ci/docker/common/common_utils.sh @@ -22,5 +22,5 @@ conda_run() { } pip_install() { - as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $* + as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip3 install --progress-bar off $* } From 45df5d0c61d919bdbb115a523cdeef165c0412da Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 21:57:24 +0000 Subject: [PATCH 15/21] Update requirements --- .ci/docker/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 00cf2f21033..9668b17fc3a 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -30,8 +30,8 @@ pytorch-lightning torchx 
torchrl==0.5.0 tensordict==0.5.0 -ax-platform>==0.4.0 -nbformat>==5.9.2 +ax-platform>=0.4.0 +nbformat>=5.9.2 datasets transformers torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable @@ -68,4 +68,4 @@ pygame==2.1.2 pycocotools semilearn==0.3.2 torchao==0.0.3 -segment_anything==1.0 \ No newline at end of file +segment_anything==1.0 From b268a3c5669751e1e567100ffe3404331b9b385d Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 23:49:19 +0000 Subject: [PATCH 16/21] fixed formatting --- recipes_source/torch_export_aoti_python.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 702d9d67f95..7f9438bdc17 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -2,7 +2,7 @@ """ (Beta) ``torch.export`` AOTInductor Tutorial for Python runtime -=================================================== +=============================================================== **Author:** Ankith Gunapal, Bin Bao, Angela Yi """ @@ -46,7 +46,7 @@ ###################################################################### # Model Compilation -# ------------ +# ----------------- # # We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the # exported PyTorch program using :func:`torch._inductor.aot_compile`. @@ -101,7 +101,7 @@ ###################################################################### # Model Inference in Python -# ------------ +# ------------------------- # # Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3, # we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. @@ -127,7 +127,7 @@ ###################################################################### # When to use AOTInductor for Python Runtime -# --------------------------------------- +# ------------------------------------------ # # One of the requirements for using AOTInductor is that the model shouldn't have any graph breaks. 
# Once this requirement is met, the primary use case for using AOTInductor Python Runtime is for From 6578d82a4b261a857e67ece73ee38545cbb17189 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 23:58:44 +0000 Subject: [PATCH 17/21] update to CUDA 12.4 --- .ci/docker/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index ad1a7395baf..c646b8f9a86 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -11,7 +11,7 @@ IMAGE_NAME="$1" shift export UBUNTU_VERSION="20.04" -export CUDA_VERSION="12.1.1" +export CUDA_VERSION="12.4.1" export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" echo "Building ${IMAGE_NAME} Docker image" From 67bc0807555194128e1cc621679c84161dbea21a Mon Sep 17 00:00:00 2001 From: Ankith Gunapal Date: Wed, 21 Aug 2024 13:23:42 -0700 Subject: [PATCH 18/21] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- recipes_source/torch_export_aoti_python.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 7f9438bdc17..786dadb369a 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -51,7 +51,7 @@ # We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the # exported PyTorch program using :func:`torch._inductor.aot_compile`. # -# .. note:: +# .. note:: # # This API also supports :func:`torch.compile` options like ``mode`` # This means that if used on a CUDA enabled device, you can, for example, set ``"max_autotune": True`` @@ -107,7 +107,7 @@ # we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. # The API follows a structure similar to the :func:`torch.jit.load` API . You need to specify the path # of the shared library and the device where it should be loaded. -# .. note:: +# .. note:: # # In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in # :func:`torch.export.export`. From fc0ff5ee18436af5e2831d4bc8d42a96bdb2cd39 Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 21 Aug 2024 22:49:07 +0000 Subject: [PATCH 19/21] addressed review comments for formatting --- recipes_source/recipes_index.rst | 6 ++++++ recipes_source/torch_export_aoti_python.py | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index d94d7d5c22e..caccdcc28f7 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -150,6 +150,12 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/recipes/swap_tensors.html :tags: Basics +.. customcarditem:: + :header: torch.export AOTInductor Tutorial for Python runtime + :card_description: Learn an end-to-end example of how to use AOTInductor for python runtime. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/torch_export_aoti_python.html + :tags: Basics .. 
Interpretability diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 786dadb369a..5a0c1f4f6a8 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -32,9 +32,9 @@ ###################################################################### # Prerequisites # ------------- -# * PyTorch 2.4 or later -# * Basic understanding of ``torch.export`` and AOTInductor -# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial +# * PyTorch 2.4 or later +# * Basic understanding of ``torch.export`` and AOTInductor +# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial ###################################################################### # What you will learn From 85f287019f181f3e224e8a5f3699630aa13f78f7 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 22 Aug 2024 08:55:57 -0700 Subject: [PATCH 20/21] Update recipes_source/torch_export_aoti_python.py --- recipes_source/torch_export_aoti_python.py | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 5a0c1f4f6a8..b3bdd2b7fe2 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -107,6 +107,7 @@ # we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. # The API follows a structure similar to the :func:`torch.jit.load` API . You need to specify the path # of the shared library and the device where it should be loaded. +# # .. note:: # # In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in From cb8ea23d09d7204b822a8150bd681395c389e45c Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 22 Aug 2024 08:56:16 -0700 Subject: [PATCH 21/21] Update recipes_source/torch_export_aoti_python.py --- recipes_source/torch_export_aoti_python.py | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index b3bdd2b7fe2..136862078c1 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -109,7 +109,6 @@ # of the shared library and the device where it should be loaded. # # .. note:: -# # In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in # :func:`torch.export.export`.
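Taken together, the series converges on a two-step flow: ``torch.export.export`` captures the model with a dynamic batch dimension, ``torch._inductor.aot_compile`` lowers the exported program to a shared library, and ``torch._export.aot_load`` runs that library from Python. The condensed sketch below restates that end state in one place as a reading aid; it is assembled from the diffs above (assuming PyTorch 2.4, TorchVision, and a writable working directory) and mirrors the recipe's variable names and calling conventions rather than introducing anything new:

import os

import torch
from torchvision.models import ResNet18_Weights, resnet18

device = "cuda" if torch.cuda.is_available() else "cpu"

model = resnet18(weights=ResNet18_Weights.DEFAULT)
model.eval()
model = model.to(device=device)
example_inputs = (torch.randn(2, 3, 224, 224, device=device),)

with torch.inference_mode():
    # Dynamic batch dimension; min=2 because of the 0/1 specialization issue the recipe links to.
    batch_dim = torch.export.Dim("batch", min=2, max=32)
    exported_program = torch.export.export(
        model,
        example_inputs,
        dynamic_shapes={"x": {0: batch_dim}},
    )
    # Ahead-of-time compilation to a shared library; the recipe also adds
    # {"max_autotune": True} to the options on CUDA devices.
    so_path = torch._inductor.aot_compile(
        exported_program.module(),
        example_inputs,
        options={"aot_inductor.output_path": os.path.join(os.getcwd(), "resnet18_pt2.so")},
    )

# Later, typically in the deployment process: load the shared library and run inference.
loaded_model = torch._export.aot_load(so_path, device)
example_inputs = (torch.randn(1, 3, 224, 224, device=device),)

with torch.inference_mode():
    output = loaded_model(example_inputs)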