3 | 3 | """
4 | 4 | .. meta::
5 | 5 |     :description: An end-to-end example of how to use AOTInductor for Python runtime.
6 | | -    :keywords: torch.export, AOTInductor, torch._inductor.aot_compile, torch._export.aot_load
| 6 | +    :keywords: torch.export, AOTInductor, torch._inductor.aoti_compile_and_package, aot_compile, torch._export.aoti_load_package
7 | 7 |
8 | 8 | ``torch.export`` AOTInductor Tutorial for Python runtime (Beta)
9 | 9 | ===============================================================

14 | 14 | #
15 | 15 | # .. warning::
16 | 16 | #
17 | | -#     ``torch._inductor.aot_compile`` and ``torch._export.aot_load`` are in Beta status and are subject to backwards compatibility
18 | | -#     breaking changes. This tutorial provides an example of how to use these APIs for model deployment using Python runtime.
| 17 | +#     ``torch._inductor.aoti_compile_and_package`` and
| 18 | +#     ``torch._inductor.aoti_load_package`` are in Beta status and are subject
| 19 | +#     to backwards compatibility breaking changes. This tutorial provides an
| 20 | +#     example of how to use these APIs for model deployment using Python
| 21 | +#     runtime.
19 | 22 | #
20 | | -# It has been shown `previously <https://pytorch.org/docs/stable/torch.compiler_aot_inductor.html#>`__ how AOTInductor can be used
21 | | -# to do Ahead-of-Time compilation of PyTorch exported models by creating
22 | | -# a shared library that can be run in a non-Python environment.
23 | | -#
24 | | -#
25 | | -# In this tutorial, you will learn an end-to-end example of how to use AOTInductor for Python runtime.
26 | | -# We will look at how to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a
27 | | -# shared library. Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`.
28 | | -# You will learn about the speed up seen in the first inference time using AOTInductor, especially when using
29 | | -# ``max-autotune`` mode which can take some time to execute.
| 23 | +# It has been shown `previously
| 24 | +# <https://pytorch.org/docs/stable/torch.compiler_aot_inductor.html#>`__ how
| 25 | +# AOTInductor can be used to do Ahead-of-Time compilation of PyTorch exported
| 26 | +# models by creating an artifact that can be run in a non-Python environment.
| 27 | +# In this tutorial, you will learn an end-to-end example of how to use
| 28 | +# AOTInductor for Python runtime.
30 | 29 | #
31 | 30 | # **Contents**
32 | 31 | #

36 | 35 | ######################################################################
37 | 36 | # Prerequisites
38 | 37 | # -------------
39 | | -# * PyTorch 2.4 or later
| 38 | +# * PyTorch 2.6 or later
40 | 39 | # * Basic understanding of ``torch.export`` and AOTInductor
41 | 40 | # * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models <https://pytorch.org/docs/stable/torch.compiler_aot_inductor.html#>`_ tutorial
42 | 41 |
43 | 42 | ######################################################################
44 | 43 | # What you will learn
45 | 44 | # ----------------------
46 | | -# * How to use AOTInductor for python runtime.
47 | | -# * How to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a shared library
48 | | -# * How to run a shared library in Python runtime using :func:`torch._export.aot_load`.
49 | | -# * When do you use AOTInductor for python runtime
| 45 | +# * How to use AOTInductor for Python runtime.
| 46 | +# * How to use :func:`torch._inductor.aoti_compile_and_package` along with :func:`torch.export.export` to generate a compiled artifact.
| 47 | +# * How to load and run the artifact in a Python runtime using :func:`torch._inductor.aoti_load_package`.
| 48 | +# * When to use AOTInductor with a Python runtime.
50 | 49 |
51 | 50 | ######################################################################
52 | 51 | # Model Compilation
53 | 52 | # -----------------
54 | 53 | #
55 | | -# We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the
56 | | -# exported PyTorch program using :func:`torch._inductor.aot_compile`.
| 54 | +# We will use the TorchVision pretrained ``ResNet18`` model as an example.
57 | 55 | #
58 | | -# .. note::
| 56 | +# The first step is to export the model to a graph representation using
| 57 | +# :func:`torch.export.export`. To learn more about using this function, you can
| 58 | +# check out the `docs <https://pytorch.org/docs/main/export.html>`_ or the
| 59 | +# `tutorial <https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`_.
59 | 60 | #
60 | | -# This API also supports :func:`torch.compile` options like ``mode``
61 | | -# This means that if used on a CUDA enabled device, you can, for example, set ``"max_autotune": True``
62 | | -# which leverages Triton based matrix multiplications & convolutions, and enables CUDA graphs by default.
| 61 | +# Once we have exported the PyTorch model and obtained an ``ExportedProgram``,
| 62 | +# we can use :func:`torch._inductor.aoti_compile_and_package` to compile the
| 63 | +# program with AOTInductor for a specified device, and save the generated
| 64 | +# contents into a ``.pt2`` artifact.
63 | 65 | #
|
64 |
| -# We also specify ``dynamic_shapes`` for the batch dimension. In this example, ``min=2`` is not a bug and is |
65 |
| -# explained in `The 0/1 Specialization Problem <https://docs.google.com/document/d/16VPOa3d-Liikf48teAOmxLc92rgvJdfosIy-yoT38Io/edit?fbclid=IwAR3HNwmmexcitV0pbZm_x1a4ykdXZ9th_eJWK-3hBtVgKnrkmemz6Pm5jRQ#heading=h.ez923tomjvyk>`__ |
66 |
| - |
| 66 | +# .. note:: |
| 67 | +# |
| 68 | +# This API supports the same available options that :func:`torch.compile` |
| 69 | +# has, such as ``mode`` and ``max_autotune`` (for those who want to enable |
| 70 | +# CUDA graphs and leverage Triton based matrix multiplications and |
| 71 | +# convolutions) |
67 | 72 |
|
68 | 73 | import os
69 | 74 | import torch
| 75 | +import torch._inductor
70 | 76 | from torchvision.models import ResNet18_Weights, resnet18
71 | 77 |
72 | 78 | model = resnet18(weights=ResNet18_Weights.DEFAULT)
73 | 79 | model.eval()
74 | 80 |
75 | 81 | with torch.inference_mode():
| 82 | +    inductor_configs = {}
76 | 83 |
77 | | -    # Specify the generated shared library path
78 | | -    aot_compile_options = {
79 | | -        "aot_inductor.output_path": os.path.join(os.getcwd(), "resnet18_pt2.so"),
80 | | -    }
81 | 84 |     if torch.cuda.is_available():
82 | 85 |         device = "cuda"
83 | | -        aot_compile_options.update({"max_autotune": True})
| 86 | +        inductor_configs["max_autotune"] = True
84 | 87 |     else:
85 | 88 |         device = "cpu"
86 | 89 |
87 | 90 |     model = model.to(device=device)
88 | 91 |     example_inputs = (torch.randn(2, 3, 224, 224, device=device),)
89 | 92 |
90 | | -    # min=2 is not a bug and is explained in the 0/1 Specialization Problem
91 | | -    batch_dim = torch.export.Dim("batch", min=2, max=32)
92 | 93 |     exported_program = torch.export.export(
93 | 94 |         model,
94 | 95 |         example_inputs,
95 | | -        # Specify the first dimension of the input x as dynamic
96 | | -        dynamic_shapes={"x": {0: batch_dim}},
97 | 96 |     )
98 | | -    so_path = torch._inductor.aot_compile(
99 | | -        exported_program.module(),
100 | | -        example_inputs,
101 | | -        # Specify the generated shared library path
102 | | -        options=aot_compile_options
| 97 | +    path = torch._inductor.aoti_compile_and_package(
| 98 | +        exported_program,
| 99 | +        package_path=os.path.join(os.getcwd(), "resnet18.pt2"),
| 100 | +        inductor_configs=inductor_configs
103 | 101 |     )
104 | 102 |
| 103 | +######################################################################
| 104 | +# The result of :func:`aoti_compile_and_package` is an artifact "resnet18.pt2"
| 105 | +# which can be loaded and executed in Python and C++.
| 106 | +#
| 107 | +# The artifact itself contains the AOTInductor-generated code, such as a
| 108 | +# generated C++ runner file, a shared library compiled from that C++ file, and
| 109 | +# CUDA binary files, also known as cubin files, if optimizing for CUDA.
| 110 | +#
| 111 | +# Structurally, the artifact is a ``.zip`` file with the following
| 112 | +# layout:
| 113 | +#
| 114 | +# .. code::
| 115 | +#
| 116 | +#    ├── archive_format
| 117 | +#    ├── version
| 118 | +#    ├── data
| 119 | +#    │   ├── aotinductor
| 120 | +#    │   │   └── model
| 121 | +#    │   │       ├── xxx.cpp              # AOTInductor generated cpp file
| 122 | +#    │   │       ├── xxx.so               # AOTInductor generated shared library
| 123 | +#    │   │       ├── xxx.cubin            # Cubin files (if running on CUDA)
| 124 | +#    │   │       └── xxx_metadata.json    # Additional metadata to save
| 125 | +#    │   ├── weights
| 126 | +#    │   │   └── TBD
| 127 | +#    │   └── constants
| 128 | +#    │       └── TBD
| 129 | +#    └── extra
| 130 | +#        └── metadata.json
| 131 | +#
| 132 | +# We can use the following command to inspect the artifact contents:
| 133 | +#
| 134 | +# .. code:: bash
| 135 | +#
| 136 | +#    $ unzip -l resnet18.pt2
| 137 | +#
| 138 | +# .. code::
| 139 | +#
| 140 | +#    Archive:  resnet18.pt2
| 141 | +#      Length      Date    Time    Name
| 142 | +#    ---------  ---------- -----   ----
| 143 | +#            1  01-08-2025 16:40   version
| 144 | +#            3  01-08-2025 16:40   archive_format
| 145 | +#        10088  01-08-2025 16:40   data/aotinductor/model/cagzt6akdaczvxwtbvqe34otfe5jlorktbqlojbzqjqvbfsjlge4.cubin
| 146 | +#        17160  01-08-2025 16:40   data/aotinductor/model/c6oytfjmt5w4c7onvtm6fray7clirxt7q5xjbwx3hdydclmwoujz.cubin
| 147 | +#        16616  01-08-2025 16:40   data/aotinductor/model/c7ydp7nocyz323hij4tmlf2kcedmwlyg6r57gaqzcsy3huneamu6.cubin
| 148 | +#        17776  01-08-2025 16:40   data/aotinductor/model/cyqdf46ordevqhiddvpdpp3uzwatfbzdpl3auj2nx23uxvplnne2.cubin
| 149 | +#        10856  01-08-2025 16:40   data/aotinductor/model/cpzfebfgrusqslui7fxsuoo4tvwulmrxirc5tmrpa4mvrbdno7kn.cubin
| 150 | +#        14608  01-08-2025 16:40   data/aotinductor/model/c5ukeoz5wmaszd7vczdz2qhtt6n7tdbl3b6wuy4rb2se24fjwfoy.cubin
| 151 | +#        11376  01-08-2025 16:40   data/aotinductor/model/csu3nstcp56tsjfycygaqsewpu64l5s6zavvz7537cm4s4cv2k3r.cubin
| 152 | +#        10984  01-08-2025 16:40   data/aotinductor/model/cp76lez4glmgq7gedf2u25zvvv6rksv5lav4q22dibd2zicbgwj3.cubin
| 153 | +#        14736  01-08-2025 16:40   data/aotinductor/model/c2bb5p6tnwz4elgujqelsrp3unvkgsyiv7xqxmpvuxcm4jfl7pc2.cubin
| 154 | +#        11376  01-08-2025 16:40   data/aotinductor/model/c6eopmb2b4ngodwsayae4r5q6ni3jlfogfbdk3ypg56tgpzhubfy.cubin
| 155 | +#        11624  01-08-2025 16:40   data/aotinductor/model/chmwe6lvoekzfowdbiizitm3haiiuad5kdm6sd2m6mv6dkn2zk32.cubin
| 156 | +#        15632  01-08-2025 16:40   data/aotinductor/model/c3jop5g344hj3ztsu4qm6ibxyaaerlhkzh2e6emak23rxfje6jam.cubin
| 157 | +#        25472  01-08-2025 16:40   data/aotinductor/model/chaiixybeiuuitm2nmqnxzijzwgnn2n7uuss4qmsupgblfh3h5hk.cubin
| 158 | +#       139389  01-08-2025 16:40   data/aotinductor/model/cvk6qzuybruhwxtfblzxiov3rlrziv5fkqc4mdhbmantfu3lmd6t.cpp
| 159 | +#           27  01-08-2025 16:40   data/aotinductor/model/cvk6qzuybruhwxtfblzxiov3rlrziv5fkqc4mdhbmantfu3lmd6t_metadata.json
| 160 | +#     47195424  01-08-2025 16:40   data/aotinductor/model/cvk6qzuybruhwxtfblzxiov3rlrziv5fkqc4mdhbmantfu3lmd6t.so
| 161 | +#    ---------                     -------
| 162 | +#     47523148                     18 files
| 163 | +
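######################################################################
# Since the ".pt2" artifact is just a zip archive, it can also be inspected
# programmatically. The snippet below is a minimal sketch, not part of the
# AOTInductor API, that lists the archive members with Python's standard
# ``zipfile`` module; it assumes the "resnet18.pt2" file generated above
# exists in the current working directory.

import os
import zipfile

with zipfile.ZipFile(os.path.join(os.getcwd(), "resnet18.pt2")) as archive:
    for member in archive.infolist():
        # Print each member's uncompressed size (in bytes) and its path inside the archive.
        print(f"{member.file_size:>12}  {member.filename}")
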
105 | 164 |
106 | 165 | ######################################################################
107 | 166 | # Model Inference in Python
108 | 167 | # -------------------------
109 | 168 | #
110 | | -# Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3,
111 | | -# we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime.
112 | | -# The API follows a structure similar to the :func:`torch.jit.load` API . You need to specify the path
113 | | -# of the shared library and the device where it should be loaded.
| 169 | +# To load and run the artifact in Python, we can use :func:`torch._inductor.aoti_load_package`.
114 | 170 | #
115 | | -# .. note::
116 | | -#     In the example above, we specified ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in
117 | | -#     :func:`torch.export.export`.
118 | | -
119 | 171 |
120 | 172 | import os
121 | 173 | import torch
| 174 | +import torch._inductor
122 | 175 |
123 | | -device = "cuda" if torch.cuda.is_available() else "cpu"
124 | | -model_so_path = os.path.join(os.getcwd(), "resnet18_pt2.so")
| 176 | +model_path = os.path.join(os.getcwd(), "resnet18.pt2")
125 | 177 |
126 | | -model = torch._export.aot_load(model_so_path, device)
127 | | -example_inputs = (torch.randn(1, 3, 224, 224, device=device),)
| 178 | +compiled_model = torch._inductor.aoti_load_package(model_path)
| 179 | +example_inputs = (torch.randn(2, 3, 224, 224, device=device),)
128 | 180 |
129 | 181 | with torch.inference_mode():
130 | | -    output = model(example_inputs)
| 182 | +    output = compiled_model(example_inputs)
| 183 | +
131 | 184 |
132 | 185 | ######################################################################
133 | | -# When to use AOTInductor for Python Runtime
134 | | -# ------------------------------------------
| 186 | +# When to use AOTInductor with a Python Runtime
| 187 | +# ---------------------------------------------
135 | 188 | #
136 | | -# One of the requirements for using AOTInductor is that the model shouldn't have any graph breaks.
137 | | -# Once this requirement is met, the primary use case for using AOTInductor Python Runtime is for
138 | | -# model deployment using Python.
139 | | -# There are mainly two reasons why you would use AOTInductor Python Runtime:
| 189 | +# There are mainly two reasons why one would use AOTInductor with a Python runtime:
140 | 190 | #
141 | | -# - ``torch._inductor.aot_compile`` generates a shared library. This is useful for model
142 | | -#   versioning for deployments and tracking model performance over time.
| 191 | +# - ``torch._inductor.aoti_compile_and_package`` generates a singular
| 192 | +#   serialized artifact. This is useful for model versioning for deployments
| 193 | +#   and tracking model performance over time.
143 | 194 | # - With :func:`torch.compile` being a JIT compiler, there is a warmup
144 | | -#   cost associated with the first compilation. Your deployment needs to account for the
145 | | -#   compilation time taken for the first inference. With AOTInductor, the compilation is
146 | | -#   done offline using ``torch.export.export`` & ``torch._indutor.aot_compile``. The deployment
147 | | -#   would only load the shared library using ``torch._export.aot_load`` and run inference.
| 195 | +#   cost associated with the first compilation. Your deployment needs to
| 196 | +#   account for the compilation time taken for the first inference. With
| 197 | +#   AOTInductor, the compilation is done ahead of time using
| 198 | +#   ``torch.export.export`` and ``torch._inductor.aoti_compile_and_package``.
| 199 | +#   At deployment time, after loading the model, running inference does not
| 200 | +#   incur any additional compilation cost; a minimal deployment-side sketch follows below.
148 | 201 | #
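######################################################################
# The following is a minimal sketch of what the deployment side could look
# like, assuming the "resnet18.pt2" artifact produced above has been shipped
# alongside the application. The artifact is loaded once, for example at
# process startup, and every call afterwards runs the ahead-of-time compiled
# kernels, so there is no JIT warmup on the first request.

import os
import torch
import torch._inductor

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the precompiled artifact once.
compiled_model = torch._inductor.aoti_load_package(
    os.path.join(os.getcwd(), "resnet18.pt2")
)

with torch.inference_mode():
    # Reuse the loaded model for every incoming batch; no compilation happens here.
    for _ in range(3):
        batch = torch.randn(2, 3, 224, 224, device=device)
        output = compiled_model(batch)
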
149 | 202 | #
150 | 203 | # The section below shows the speedup achieved with AOTInductor for first inference

@@ -185,7 +238,7 @@ def timed(fn):
185 | 238 |
186 | 239 | torch._dynamo.reset()
187 | 240 |
188 | | -model = torch._export.aot_load(model_so_path, device)
| 241 | +model = torch._inductor.aoti_load_package(model_path)
189 | 242 | example_inputs = (torch.randn(1, 3, 224, 224, device=device),)
190 | 243 |
191 | 244 | with torch.inference_mode():

@@ -217,8 +270,7 @@ def timed(fn):
217 | 270 | # ----------
218 | 271 | #
219 | 272 | # In this recipe, we have learned how to effectively use the AOTInductor for Python runtime by
220 | | -# compiling and loading a pretrained ``ResNet18`` model using the ``torch._inductor.aot_compile``
221 | | -# and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of
222 | | -# generating a shared library and running it within a Python environment, even with dynamic shape
223 | | -# considerations and device-specific optimizations. We also looked at the advantage of using
| 273 | +# compiling and loading a pretrained ``ResNet18`` model. This process
| 274 | +# demonstrates the practical application of generating a compiled artifact and
| 275 | +# running it within a Python environment. We also looked at the advantage of using
224 | 276 | # AOTInductor in model deployments, with regards to speed up in first inference time.