From 1dea27833a1b2d0144cde38b9b0b0e8c3aaafef6 Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 12 Aug 2024 23:32:22 +0000 Subject: [PATCH 01/21] Tutorial for AOTI Python runtime --- .../torch_export_aoti_python.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 intermediate_source/torch_export_aoti_python.py diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py new file mode 100644 index 00000000000..48168dcef73 --- /dev/null +++ b/intermediate_source/torch_export_aoti_python.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +""" +torch.export AOT Inductor Tutorial for Python runtime +=================================================== +**Author:** Ankith Gunapal +""" + +###################################################################### +# +# .. warning:: +# +# ``torch._export.aot_compile`` and ``torch._export.aot_load`` are in Beta status and are subject to backwards compatibility +# breaking changes. This tutorial provides an example of how to use these APIs for model deployment using Python runtime. +# +# It has been shown `previously `__ how AOTInductor can be used +# to do Ahead-of-Time compilation of PyTorch exported models by creating +# a shared library that can be run in a non-Python environment. +# +# +# In this tutorial, you will learn an end-to-end example of how to use AOTInductor for python runtime. +# We will look at how to use :func:`torch._export.aot_compile` to generate a shared library. +# We also look at how we can run the shared library in python runtime using :func:`torch._export.aot_load`. +# +# **Contents** +# +# .. contents:: +# :local: + + +###################################################################### +# Model Compilation +# ------------ +# +# We will use TorchVision's pretrained `ResNet18` model in this example and use TorchInductor on the +# exported PyTorch program using :func:`torch._export.aot_compile` +# +# .. note:: +# +# This API also supports :func:`torch.compile` options like `mode` +# As an example, if used on a CUDA enabled device, we can set `"max_autotune": True` +# +# We also specify `dynamic_shapes` for the batch dimension. 
In this example, min=2 is not a bug and is +# explained in `The 0/1 Specialization Problem `__ + + +import os +import torch +from torchvision.models import ResNet18_Weights, resnet18 + +model = resnet18(weights=ResNet18_Weights.DEFAULT) +model.eval() + +with torch.inference_mode(): + + # Specify the generated shared library path + aot_compile_options = { + "aot_inductor.output_path": os.path.join(os.getcwd(), "resnet18_pt2.so"), + } + if torch.cuda.is_available(): + device = "cuda" + aot_compile_options.update({"max_autotune": True}) + else: + device = "cpu" + # We need to turn off the below optimizations to support batch_size = 16, + # which is treated like a special case + # https://github.com/pytorch/pytorch/pull/116152 + torch.backends.mkldnn.set_flags(False) + torch.backends.nnpack.set_flags(False) + + model = model.to(device=device) + example_inputs = (torch.randn(2, 3, 224, 224, device=device),) + + # min=2 is not a bug and is explained in the 0/1 Specialization Problem + batch_dim = torch.export.Dim("batch", min=2, max=32) + so_path = torch._export.aot_compile( + model, + example_inputs, + # Specify the first dimension of the input x as dynamic + dynamic_shapes={"x": {0: batch_dim}}, + # Specify the generated shared library path + options=aot_compile_options + ) + + +###################################################################### +# Model Inference in Python +# ------------ +# +# Typically the shared object generated above is used in a non-Python environment. In PyTorch 2.3, +# we added a new API :func:`torch._export.aot_load` to load the shared library in python runtime. +# The API follows a similar structure to the :func:`torch.jit.load` API . We specify the path +# of the shared library and the device where this should be loaded. +# .. note:: +# +# We specify batch_size=1 for inference and it works even though we specified min=2 in +# :func:`torch._export.aot_compile` + + +import os +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" +model_so_path = os.path.join(os.getcwd(), "resnet18_pt2.so") + +model = torch._export.aot_load(model_so_path, device) +example_inputs = (torch.randn(1, 3, 224, 224, device=device),) + +with torch.inference_mode(): + output = model(example_inputs) \ No newline at end of file From cd09129c21873b56518492d58caebd16bfebcb53 Mon Sep 17 00:00:00 2001 From: Ankith Gunapal Date: Tue, 13 Aug 2024 10:40:12 -0700 Subject: [PATCH 02/21] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- .../torch_export_aoti_python.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 48168dcef73..9df68bae5fb 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -torch.export AOT Inductor Tutorial for Python runtime +(Beta) ``torch.export`` AOT Inductor Tutorial for Python runtime =================================================== **Author:** Ankith Gunapal """ @@ -20,7 +20,7 @@ # # In this tutorial, you will learn an end-to-end example of how to use AOTInductor for python runtime. # We will look at how to use :func:`torch._export.aot_compile` to generate a shared library. -# We also look at how we can run the shared library in python runtime using :func:`torch._export.aot_load`. 
+# Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. # # **Contents** # @@ -32,15 +32,15 @@ # Model Compilation # ------------ # -# We will use TorchVision's pretrained `ResNet18` model in this example and use TorchInductor on the -# exported PyTorch program using :func:`torch._export.aot_compile` +# We will use TorchVision's pretrained `ResNet18` model and TorchInductor on the +# exported PyTorch program using :func:`torch._export.aot_compile`. # # .. note:: # # This API also supports :func:`torch.compile` options like `mode` # As an example, if used on a CUDA enabled device, we can set `"max_autotune": True` # -# We also specify `dynamic_shapes` for the batch dimension. In this example, min=2 is not a bug and is +# We also specify ``dynamic_shapes`` for the batch dimension. In this example, ``min=2`` is not a bug and is # explained in `The 0/1 Specialization Problem `__ @@ -87,14 +87,14 @@ # Model Inference in Python # ------------ # -# Typically the shared object generated above is used in a non-Python environment. In PyTorch 2.3, -# we added a new API :func:`torch._export.aot_load` to load the shared library in python runtime. +# Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3, +# we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. # The API follows a similar structure to the :func:`torch.jit.load` API . We specify the path -# of the shared library and the device where this should be loaded. +# of the shared library and the device where it should be loaded. # .. note:: # -# We specify batch_size=1 for inference and it works even though we specified min=2 in -# :func:`torch._export.aot_compile` +# In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in +# :func:`torch._export.aot_compile`. import os From 3fa9b208c593a0683893f12c3d49bc1a53d99f11 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 13 Aug 2024 20:10:53 +0000 Subject: [PATCH 03/21] Addressed review comments and added a section on why AOTI Python --- .jenkins/metadata.json | 3 + .../torch_export_aoti_python.py | 116 +++++++++++++++++- 2 files changed, 115 insertions(+), 4 deletions(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 4814f9a7d2b..5c6af7b80ff 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -28,6 +28,9 @@ "intermediate_source/model_parallel_tutorial.py": { "needs": "linux.16xlarge.nvidia.gpu" }, + "intermediate_source/torch_export_aoti_python.py": { + "needs": "linux.16xlarge.nvidia.gpu" + }, "advanced_source/pendulum.py": { "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 9df68bae5fb..8279977c767 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -27,6 +27,19 @@ # .. 
contents:: # :local: +###################################################################### +# Prerequisites +# ------------- +# * PyTorch 2.4 or later +# * Basic understanding of ``torch._export`` and AOT Inductor +# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial + +###################################################################### +# What you will learn +# ---------------------- +# * How to use AOTInductor for python runtime. +# * How to use :func:`torch._export.aot_compile` to generate a shared library +# * How to run a shared library in Python runtime using :func:`torch._export.aot_load`. ###################################################################### # Model Compilation @@ -37,8 +50,9 @@ # # .. note:: # -# This API also supports :func:`torch.compile` options like `mode` -# As an example, if used on a CUDA enabled device, we can set `"max_autotune": True` +# This API also supports :func:`torch.compile` options like ``mode`` +# This means that if used on a CUDA enabled device, you can, for example, set ``"max_autotune": True`` +# which leverages Triton based matrix multiplications & convolutions, and enables CUDA graphs by default. # # We also specify ``dynamic_shapes`` for the batch dimension. In this example, ``min=2`` is not a bug and is # explained in `The 0/1 Specialization Problem `__ @@ -89,7 +103,7 @@ # # Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3, # we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. -# The API follows a similar structure to the :func:`torch.jit.load` API . We specify the path +# The API follows a structure similar to the :func:`torch.jit.load` API . You need to specify the path # of the shared library and the device where it should be loaded. # .. note:: # @@ -107,4 +121,98 @@ example_inputs = (torch.randn(1, 3, 224, 224, device=device),) with torch.inference_mode(): - output = model(example_inputs) \ No newline at end of file + output = model(example_inputs) + +###################################################################### +# When to use AOT Inductor Python Runtime +# --------------------------------------- +# +# One of the requirements for using AOT Inductor is that the model shouldn't have any graph breaks. +# Once this requirement is met, the primary use case for using AOT Inductor Python Runtime is for +# model deployment using Python. +# There are mainly two reasons why you would use AOT Inductor Python Runtime: +# +# - ``torch._export.aot_compile`` generates a shared library. This is useful for model +# versioning for deployments and tracking model performance over time. +# - With :func:`torch.compile` being a JIT compiler, there is a warmup +# cost associated with the first compilation. Your deployment needs to account for the +# compilation time taken for the first inference. With AOT Inductor, the compilation is +# done offline using ``torch._export.aot_compile``. The deployment would only load the +# shared library using ``torch._export.aot_load`` and run inference. +# +# +# The section below shows the speedup achieved with AOT Inductor for first inference +# +# We define a utility function ``timed`` to measure the time taken for inference +# + +import time +def timed(fn): + # Returns the result of running `fn()` and the time it took for `fn()` to run, + # in seconds. We use CUDA events and synchronization for accurate + # measurement on CUDA enabled devices. 
+    if torch.cuda.is_available():
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        start.record()
+    else:
+        start = time.time()
+
+    result = fn()
+    if torch.cuda.is_available():
+        end.record()
+        torch.cuda.synchronize()
+    else:
+        end = time.time()
+
+    # Measure time taken to execute the function in milliseconds
+    if torch.cuda.is_available():
+        duration = start.elapsed_time(end)
+    else:
+        duration = (end - start) * 1000
+
+    return result, duration
+
+
+######################################################################
+# Lets measure the time for first inference using AOT Inductor
+
+torch._dynamo.reset()
+
+model = torch._export.aot_load(model_so_path, device)
+example_inputs = (torch.randn(1, 3, 224, 224, device=device),)
+
+with torch.inference_mode():
+    _, time_taken = timed(lambda: model(example_inputs))
+    print(f"Time taken for first inference for AOT Inductor is {time_taken:.2f} ms")
+
+
+######################################################################
+# Lets measure the time for first inference using ``torch.compile``
+
+torch._dynamo.reset()
+
+model = resnet18(weights=ResNet18_Weights.DEFAULT).to(device)
+model.eval()
+
+model = torch.compile(model)
+example_inputs = torch.randn(1, 3, 224, 224, device=device)
+
+with torch.inference_mode():
+    _, time_taken = timed(lambda: model(example_inputs))
+    print(f"Time taken for first inference for torch.compile is {time_taken:.2f} ms")
+
+######################################################################
+# We see that there is a drastic speedup in first inference time using AOT Inductor compared
+# to ``torch.compile``
+
+######################################################################
+# Conclusion
+# ----------
+#
+# In this tutorial, we have learned how to effectively use the AOTInductor for Python runtime by
+# compiling and loading a pretrained ``ResNet18`` model using the ``torch._export.aot_compile``
+# and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of
+# generating a shared library and running it within a Python environment, even with dynamic shape
+# considerations and device-specific optimizations. We also looked at the advantage of using
+# AOT Inductor in model deployments, with regards to speed up in first inference time. 
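The first-inference comparison added in this patch isolates the compilation cost that ``torch.compile`` pays at run time. As a complementary illustration (a minimal sketch, not taken from the patches themselves; it assumes the same TorchVision ``ResNet18`` weights and runs on either CPU or a CUDA device), timing a few consecutive calls to the ``torch.compile`` model shows that the cost is paid only on the first call, which is exactly the warmup that AOTInductor moves offline:

import time

import torch
from torchvision.models import ResNet18_Weights, resnet18

device = "cuda" if torch.cuda.is_available() else "cpu"

model = resnet18(weights=ResNet18_Weights.DEFAULT).to(device).eval()
compiled_model = torch.compile(model)
x = torch.randn(1, 3, 224, 224, device=device)

with torch.inference_mode():
    for i in range(3):
        # Synchronize around each call so the wall-clock time reflects the full GPU work.
        if device == "cuda":
            torch.cuda.synchronize()
        start = time.time()
        compiled_model(x)
        if device == "cuda":
            torch.cuda.synchronize()
        print(f"torch.compile call {i}: {(time.time() - start) * 1000:.2f} ms")

The first call includes TorchDynamo tracing and TorchInductor code generation; later calls reuse the compiled graph, so only the first inference carries the overhead measured in the patch above.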
From 7c9edb7991ee8b3f37fd914f4e30094b8815c4e5 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 13 Aug 2024 20:13:55 +0000 Subject: [PATCH 04/21] Addressed review comments and added a section on why AOTI Python --- .../torch_export_aoti_python.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 8279977c767..9ac4e344919 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -(Beta) ``torch.export`` AOT Inductor Tutorial for Python runtime +(Beta) ``torch.export`` AOTInductor Tutorial for Python runtime =================================================== **Author:** Ankith Gunapal """ @@ -31,7 +31,7 @@ # Prerequisites # ------------- # * PyTorch 2.4 or later -# * Basic understanding of ``torch._export`` and AOT Inductor +# * Basic understanding of ``torch._export`` and AOTInductor # * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial ###################################################################### @@ -40,6 +40,7 @@ # * How to use AOTInductor for python runtime. # * How to use :func:`torch._export.aot_compile` to generate a shared library # * How to run a shared library in Python runtime using :func:`torch._export.aot_load`. +# * When do you use AOTInductor for python runtime ###################################################################### # Model Compilation @@ -124,24 +125,24 @@ output = model(example_inputs) ###################################################################### -# When to use AOT Inductor Python Runtime +# When to use AOTInductor for Python Runtime # --------------------------------------- # -# One of the requirements for using AOT Inductor is that the model shouldn't have any graph breaks. -# Once this requirement is met, the primary use case for using AOT Inductor Python Runtime is for +# One of the requirements for using AOTInductor is that the model shouldn't have any graph breaks. +# Once this requirement is met, the primary use case for using AOTInductor Python Runtime is for # model deployment using Python. -# There are mainly two reasons why you would use AOT Inductor Python Runtime: +# There are mainly two reasons why you would use AOTInductor Python Runtime: # # - ``torch._export.aot_compile`` generates a shared library. This is useful for model # versioning for deployments and tracking model performance over time. # - With :func:`torch.compile` being a JIT compiler, there is a warmup # cost associated with the first compilation. Your deployment needs to account for the -# compilation time taken for the first inference. With AOT Inductor, the compilation is +# compilation time taken for the first inference. With AOTInductor, the compilation is # done offline using ``torch._export.aot_compile``. The deployment would only load the # shared library using ``torch._export.aot_load`` and run inference. 
# # -# The section below shows the speedup achieved with AOT Inductor for first inference +# The section below shows the speedup achieved with AOTInductor for first inference # # We define a utility function ``timed`` to measure the time taken for inference # @@ -175,7 +176,7 @@ def timed(fn): ###################################################################### -# Lets measure the time for first inference using AOT Inductor +# Lets measure the time for first inference using AOTInductor torch._dynamo.reset() @@ -184,7 +185,7 @@ def timed(fn): with torch.inference_mode(): _, time_taken = timed(lambda: model(example_inputs)) - print(f"Time taken for first inference for AOT Inductor is {time_taken:.2f} ms") + print(f"Time taken for first inference for AOTInductor is {time_taken:.2f} ms") ###################################################################### @@ -203,7 +204,7 @@ def timed(fn): print(f"Time taken for first inference for torch.compile is {time_taken:.2f} ms") ###################################################################### -# We see that there is a drastic speedup in first inference time using AOT Inductor compared +# We see that there is a drastic speedup in first inference time using AOTInductor compared # to ``torch.compile`` ###################################################################### @@ -215,4 +216,4 @@ def timed(fn): # and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of # generating a shared library and running it within a Python environment, even with dynamic shape # considerations and device-specific optimizations. We also looked at the advantage of using -# AOT Inductor in model deployments, with regards to speed up in first inference time. +# AOTInductor in model deployments, with regards to speed up in first inference time. From 9cba6fbebc20b52e85bc48ed0aeabe0738e08c9f Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 13 Aug 2024 21:37:43 +0000 Subject: [PATCH 05/21] fixed spelling --- en-wordlist.txt | 3 ++- intermediate_source/torch_export_aoti_python.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/en-wordlist.txt b/en-wordlist.txt index 62762ab69cc..e69cbaa1a5f 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -2,6 +2,7 @@ ACL ADI AOT +AOTInductor APIs ATen AVX @@ -617,4 +618,4 @@ warmstarting warmup webp wsi -wsis \ No newline at end of file +wsis diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 9ac4e344919..5102ce4acdd 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -46,7 +46,7 @@ # Model Compilation # ------------ # -# We will use TorchVision's pretrained `ResNet18` model and TorchInductor on the +# We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the # exported PyTorch program using :func:`torch._export.aot_compile`. # # .. 
note:: From a6f6cd991614c9da45ff9e9c13301a6f44e60338 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 13 Aug 2024 21:42:35 +0000 Subject: [PATCH 06/21] fixed spelling --- intermediate_source/torch_export_aoti_python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index 5102ce4acdd..aa1b85bb1a2 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -3,7 +3,7 @@ """ (Beta) ``torch.export`` AOTInductor Tutorial for Python runtime =================================================== -**Author:** Ankith Gunapal +**Author:** Ankith Gunapal, Bin Bao """ ###################################################################### From 1375373b837aa307cb12777c2b0df42a0e7c576e Mon Sep 17 00:00:00 2001 From: Ankith Gunapal Date: Fri, 16 Aug 2024 14:24:14 -0700 Subject: [PATCH 07/21] Apply suggestions from code review Co-authored-by: Angela Yi Co-authored-by: Svetlana Karslioglu --- intermediate_source/torch_export_aoti_python.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index aa1b85bb1a2..a5e482b7361 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -88,11 +88,15 @@ # min=2 is not a bug and is explained in the 0/1 Specialization Problem batch_dim = torch.export.Dim("batch", min=2, max=32) - so_path = torch._export.aot_compile( + exported_program = torch.export.export( model, example_inputs, # Specify the first dimension of the input x as dynamic dynamic_shapes={"x": {0: batch_dim}}, + ) + so_path = torch._inductor.aot_compile( + exported_program.module(), + example_inputs, # Specify the generated shared library path options=aot_compile_options ) @@ -211,7 +215,7 @@ def timed(fn): # Conclusion # ---------- # -# In this tutorial, we have learned how to effectively use the AOTInductor for Python runtime by +# In this recipe, we have learned how to effectively use the AOTInductor for Python runtime by # compiling and loading a pretrained ``ResNet18`` model using the ``torch._export.aot_compile`` # and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of # generating a shared library and running it within a Python environment, even with dynamic shape From 71589859be8bfd3405e5427785caa8d1ae620fb8 Mon Sep 17 00:00:00 2001 From: agunapal Date: Fri, 16 Aug 2024 23:03:18 +0000 Subject: [PATCH 08/21] Addressed review comment --- intermediate_source/torch_export_aoti_python.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/intermediate_source/torch_export_aoti_python.py b/intermediate_source/torch_export_aoti_python.py index a5e482b7361..321892688b1 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/intermediate_source/torch_export_aoti_python.py @@ -21,6 +21,8 @@ # In this tutorial, you will learn an end-to-end example of how to use AOTInductor for python runtime. # We will look at how to use :func:`torch._export.aot_compile` to generate a shared library. # Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. +# You will learn about the speed up seen in the first inference time using AOTInductor, especially when using +# ``max-autotune`` mode which can take some time to execute. 
# # **Contents** # From 53f59654ac357f3bb39e825c895d014bd60ddd23 Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 19 Aug 2024 17:33:02 +0000 Subject: [PATCH 09/21] Changing to use g5.4xlarge machine --- .jenkins/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 5c6af7b80ff..155965abf1b 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -29,7 +29,7 @@ "needs": "linux.16xlarge.nvidia.gpu" }, "intermediate_source/torch_export_aoti_python.py": { - "needs": "linux.16xlarge.nvidia.gpu" + "needs": "linux.g5.4xlarge.nvidia.gpu" }, "advanced_source/pendulum.py": { "needs": "linux.g5.4xlarge.nvidia.gpu", From 4aa8399afff4999b078072729f173031bf48d652 Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 19 Aug 2024 19:59:35 +0000 Subject: [PATCH 10/21] Moved tutorial to recipe --- .../torch_export_aoti_python.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) rename {intermediate_source => recipes_source}/torch_export_aoti_python.py (88%) diff --git a/intermediate_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py similarity index 88% rename from intermediate_source/torch_export_aoti_python.py rename to recipes_source/torch_export_aoti_python.py index 321892688b1..46179531d04 100644 --- a/intermediate_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -3,14 +3,14 @@ """ (Beta) ``torch.export`` AOTInductor Tutorial for Python runtime =================================================== -**Author:** Ankith Gunapal, Bin Bao +**Author:** Ankith Gunapal, Bin Bao, Angela Yi """ ###################################################################### # # .. warning:: # -# ``torch._export.aot_compile`` and ``torch._export.aot_load`` are in Beta status and are subject to backwards compatibility +# ``torch._inductor.aot_compile`` and ``torch._export.aot_load`` are in Beta status and are subject to backwards compatibility # breaking changes. This tutorial provides an example of how to use these APIs for model deployment using Python runtime. # # It has been shown `previously `__ how AOTInductor can be used @@ -19,8 +19,8 @@ # # # In this tutorial, you will learn an end-to-end example of how to use AOTInductor for python runtime. -# We will look at how to use :func:`torch._export.aot_compile` to generate a shared library. -# Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. +# We will look at how to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a +# shared library. Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. # You will learn about the speed up seen in the first inference time using AOTInductor, especially when using # ``max-autotune`` mode which can take some time to execute. # @@ -33,14 +33,14 @@ # Prerequisites # ------------- # * PyTorch 2.4 or later -# * Basic understanding of ``torch._export`` and AOTInductor +# * Basic understanding of ``torch.export`` and AOTInductor # * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial ###################################################################### # What you will learn # ---------------------- # * How to use AOTInductor for python runtime. 
-# * How to use :func:`torch._export.aot_compile` to generate a shared library
+# * How to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a shared library
# * How to run a shared library in Python runtime using :func:`torch._export.aot_load`.
# * When do you use AOTInductor for python runtime
@@ -49,7 +49,7 @@
# ------------
#
# We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the
-# exported PyTorch program using :func:`torch._export.aot_compile`.
+# exported PyTorch program using :func:`torch._inductor.aot_compile`.
#
# .. note::
#
@@ -115,7 +115,7 @@
# .. note::
#
# In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in
-# :func:`torch._export.aot_compile`.
+# :func:`torch.export.export`.

import os
@@ -139,13 +139,13 @@
# model deployment using Python.
# There are mainly two reasons why you would use AOTInductor Python Runtime:
#
-# - ``torch._export.aot_compile`` generates a shared library. This is useful for model
+# - ``torch._inductor.aot_compile`` generates a shared library. This is useful for model
# versioning for deployments and tracking model performance over time.
# - With :func:`torch.compile` being a JIT compiler, there is a warmup
# cost associated with the first compilation. Your deployment needs to account for the
# compilation time taken for the first inference. With AOTInductor, the compilation is
-# done offline using ``torch._export.aot_compile``. The deployment would only load the
-# shared library using ``torch._export.aot_load`` and run inference.
+# done offline using ``torch.export.export`` & ``torch._inductor.aot_compile``. The deployment
+# would only load the shared library using ``torch._export.aot_load`` and run inference.
#
#
# The section below shows the speedup achieved with AOTInductor for first inference
#
# We define a utility function ``timed`` to measure the time taken for inference
#
@@ -218,7 +218,7 @@ def timed(fn):
# ----------
#
# In this recipe, we have learned how to effectively use the AOTInductor for Python runtime by
-# compiling and loading a pretrained ``ResNet18`` model using the ``torch._export.aot_compile``
+# compiling and loading a pretrained ``ResNet18`` model using the ``torch._inductor.aot_compile``
# and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of
# generating a shared library and running it within a Python environment, even with dynamic shape
# considerations and device-specific optimizations. 
We also looked at the advantage of using From 35c5dc892cc9725b3b1fcfb8f8f16d79247bb1fd Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 19 Aug 2024 20:03:39 +0000 Subject: [PATCH 11/21] addressed review comments --- recipes_source/torch_export_aoti_python.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 46179531d04..702d9d67f95 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -79,11 +79,6 @@ aot_compile_options.update({"max_autotune": True}) else: device = "cpu" - # We need to turn off the below optimizations to support batch_size = 16, - # which is treated like a special case - # https://github.com/pytorch/pytorch/pull/116152 - torch.backends.mkldnn.set_flags(False) - torch.backends.nnpack.set_flags(False) model = model.to(device=device) example_inputs = (torch.randn(2, 3, 224, 224, device=device),) From 71acd963ca898af95674a5c9ef7f0ecd2f5aa248 Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 19 Aug 2024 20:48:31 +0000 Subject: [PATCH 12/21] Moved tutorial to recipe --- .jenkins/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 155965abf1b..2f1a9933aab 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -28,7 +28,7 @@ "intermediate_source/model_parallel_tutorial.py": { "needs": "linux.16xlarge.nvidia.gpu" }, - "intermediate_source/torch_export_aoti_python.py": { + "recipes_source/torch_export_aoti_python.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, "advanced_source/pendulum.py": { From 7f5fde997030eb1767319dd36f2cdb0099d1bd1b Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 00:00:31 +0000 Subject: [PATCH 13/21] Change base image to nvidia devel image --- .ci/docker/build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 31f42fdbd85..ad1a7395baf 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -11,8 +11,9 @@ IMAGE_NAME="$1" shift export UBUNTU_VERSION="20.04" +export CUDA_VERSION="12.1.1" -export BASE_IMAGE="ubuntu:${UBUNTU_VERSION}" +export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" echo "Building ${IMAGE_NAME} Docker image" docker build \ From 790f7625f0dd28ee2fcb7271405860bb8ba76f65 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 01:12:10 +0000 Subject: [PATCH 14/21] Change base image to nvidia devel image --- .ci/docker/common/common_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh index b20286a4099..c7eabda555d 100644 --- a/.ci/docker/common/common_utils.sh +++ b/.ci/docker/common/common_utils.sh @@ -22,5 +22,5 @@ conda_run() { } pip_install() { - as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $* + as_ci_user conda run -n py_$ANACONDA_PYTHON_VERSION pip3 install --progress-bar off $* } From 45df5d0c61d919bdbb115a523cdeef165c0412da Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 21:57:24 +0000 Subject: [PATCH 15/21] Update requirements --- .ci/docker/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 00cf2f21033..9668b17fc3a 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -30,8 +30,8 @@ pytorch-lightning torchx 
torchrl==0.5.0 tensordict==0.5.0 -ax-platform>==0.4.0 -nbformat>==5.9.2 +ax-platform>=0.4.0 +nbformat>=5.9.2 datasets transformers torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable @@ -68,4 +68,4 @@ pygame==2.1.2 pycocotools semilearn==0.3.2 torchao==0.0.3 -segment_anything==1.0 \ No newline at end of file +segment_anything==1.0 From b268a3c5669751e1e567100ffe3404331b9b385d Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 23:49:19 +0000 Subject: [PATCH 16/21] fixed formatting --- recipes_source/torch_export_aoti_python.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 702d9d67f95..7f9438bdc17 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -2,7 +2,7 @@ """ (Beta) ``torch.export`` AOTInductor Tutorial for Python runtime -=================================================== +=============================================================== **Author:** Ankith Gunapal, Bin Bao, Angela Yi """ @@ -46,7 +46,7 @@ ###################################################################### # Model Compilation -# ------------ +# ----------------- # # We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the # exported PyTorch program using :func:`torch._inductor.aot_compile`. @@ -101,7 +101,7 @@ ###################################################################### # Model Inference in Python -# ------------ +# ------------------------- # # Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3, # we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. @@ -127,7 +127,7 @@ ###################################################################### # When to use AOTInductor for Python Runtime -# --------------------------------------- +# ------------------------------------------ # # One of the requirements for using AOTInductor is that the model shouldn't have any graph breaks. 
# Once this requirement is met, the primary use case for using AOTInductor Python Runtime is for From 6578d82a4b261a857e67ece73ee38545cbb17189 Mon Sep 17 00:00:00 2001 From: agunapal Date: Tue, 20 Aug 2024 23:58:44 +0000 Subject: [PATCH 17/21] update to CUDA 12.4 --- .ci/docker/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index ad1a7395baf..c646b8f9a86 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -11,7 +11,7 @@ IMAGE_NAME="$1" shift export UBUNTU_VERSION="20.04" -export CUDA_VERSION="12.1.1" +export CUDA_VERSION="12.4.1" export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" echo "Building ${IMAGE_NAME} Docker image" From 67bc0807555194128e1cc621679c84161dbea21a Mon Sep 17 00:00:00 2001 From: Ankith Gunapal Date: Wed, 21 Aug 2024 13:23:42 -0700 Subject: [PATCH 18/21] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- recipes_source/torch_export_aoti_python.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 7f9438bdc17..786dadb369a 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -51,7 +51,7 @@ # We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the # exported PyTorch program using :func:`torch._inductor.aot_compile`. # -# .. note:: +# .. note:: # # This API also supports :func:`torch.compile` options like ``mode`` # This means that if used on a CUDA enabled device, you can, for example, set ``"max_autotune": True`` @@ -107,7 +107,7 @@ # we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. # The API follows a structure similar to the :func:`torch.jit.load` API . You need to specify the path # of the shared library and the device where it should be loaded. -# .. note:: +# .. note:: # # In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in # :func:`torch.export.export`. From fc0ff5ee18436af5e2831d4bc8d42a96bdb2cd39 Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 21 Aug 2024 22:49:07 +0000 Subject: [PATCH 19/21] addressed review comments for formatting --- recipes_source/recipes_index.rst | 6 ++++++ recipes_source/torch_export_aoti_python.py | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index d94d7d5c22e..caccdcc28f7 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -150,6 +150,12 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/recipes/swap_tensors.html :tags: Basics +.. customcarditem:: + :header: torch.export AOTInductor Tutorial for Python runtime + :card_description: Learn an end-to-end example of how to use AOTInductor for python runtime. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/torch_export_aoti_python.html + :tags: Basics .. 
Interpretability diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 786dadb369a..5a0c1f4f6a8 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -32,9 +32,9 @@ ###################################################################### # Prerequisites # ------------- -# * PyTorch 2.4 or later -# * Basic understanding of ``torch.export`` and AOTInductor -# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial +# * PyTorch 2.4 or later +# * Basic understanding of ``torch.export`` and AOTInductor +# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial ###################################################################### # What you will learn From 85f287019f181f3e224e8a5f3699630aa13f78f7 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 22 Aug 2024 08:55:57 -0700 Subject: [PATCH 20/21] Update recipes_source/torch_export_aoti_python.py --- recipes_source/torch_export_aoti_python.py | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index 5a0c1f4f6a8..b3bdd2b7fe2 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -107,6 +107,7 @@ # we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. # The API follows a structure similar to the :func:`torch.jit.load` API . You need to specify the path # of the shared library and the device where it should be loaded. +# # .. note:: # # In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in From cb8ea23d09d7204b822a8150bd681395c389e45c Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Thu, 22 Aug 2024 08:56:16 -0700 Subject: [PATCH 21/21] Update recipes_source/torch_export_aoti_python.py --- recipes_source/torch_export_aoti_python.py | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py index b3bdd2b7fe2..136862078c1 100644 --- a/recipes_source/torch_export_aoti_python.py +++ b/recipes_source/torch_export_aoti_python.py @@ -109,7 +109,6 @@ # of the shared library and the device where it should be loaded. # # .. note:: -# # In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in # :func:`torch.export.export`.
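Taken together, the series converges on a two-step flow: ``torch.export.export`` captures the model with a dynamic batch dimension, ``torch._inductor.aot_compile`` lowers the exported program to a shared library, and ``torch._export.aot_load`` runs that library from Python. The condensed sketch below restates that end state in one place as a reading aid; it is assembled from the diffs above (assuming PyTorch 2.4, TorchVision, and a writable working directory) and mirrors the recipe's variable names and calling conventions rather than introducing anything new:

import os

import torch
from torchvision.models import ResNet18_Weights, resnet18

device = "cuda" if torch.cuda.is_available() else "cpu"

model = resnet18(weights=ResNet18_Weights.DEFAULT)
model.eval()
model = model.to(device=device)
example_inputs = (torch.randn(2, 3, 224, 224, device=device),)

with torch.inference_mode():
    # Dynamic batch dimension; min=2 because of the 0/1 specialization issue the recipe links to.
    batch_dim = torch.export.Dim("batch", min=2, max=32)
    exported_program = torch.export.export(
        model,
        example_inputs,
        dynamic_shapes={"x": {0: batch_dim}},
    )
    # Ahead-of-time compilation to a shared library; the recipe also adds
    # {"max_autotune": True} to the options on CUDA devices.
    so_path = torch._inductor.aot_compile(
        exported_program.module(),
        example_inputs,
        options={"aot_inductor.output_path": os.path.join(os.getcwd(), "resnet18_pt2.so")},
    )

# Later, typically in the deployment process: load the shared library and run inference.
loaded_model = torch._export.aot_load(so_path, device)
example_inputs = (torch.randn(1, 3, 224, 224, device=device),)

with torch.inference_mode():
    output = loaded_model(example_inputs)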