Commit a899070

make some updates to torch.compile mode and explain why the 2nd run is slower
1 parent 8b1ed83

File tree

3 files changed, +39 -2 lines


intermediate_source/ipex_test.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+import torch
+import torchvision.models as models
+model = models.resnet50(weights='ResNet50_Weights.DEFAULT')
+model.eval()
+data = torch.rand(1, 3, 224, 224)
+#################### code changes ####################
+import intel_extension_for_pytorch as ipex
+# Optionally invoke the following API to apply frontend optimizations
+model = ipex.optimize(model, weights_prepack=False)
+compile_model = torch.compile(model, backend="ipex")
+######################################################
+with torch.no_grad():
+    print(compile_model(data))
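For context, one way to check what the ``ipex`` backend buys you is to time the eager model against the compiled one. The sketch below is illustrative rather than part of the test file; it assumes ``intel_extension_for_pytorch`` is installed and reuses the calls shown above.

import time

import torch
import torchvision.models as models
import intel_extension_for_pytorch as ipex

model = models.resnet50(weights='ResNet50_Weights.DEFAULT')
model.eval()
data = torch.rand(1, 3, 224, 224)

# Same optimize/compile calls as in ipex_test.py.
optimized_model = ipex.optimize(model, weights_prepack=False)
compiled_model = torch.compile(optimized_model, backend="ipex")

def avg_seconds(m, x, iters=10):
    # Run once first so compilation is excluded from the measurement.
    with torch.no_grad():
        m(x)
        start = time.time()
        for _ in range(iters):
            m(x)
    return (time.time() - start) / iters

print("eager:   ", avg_seconds(model, data))
print("compiled:", avg_seconds(compiled_model, data))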

intermediate_source/local_test.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+import torch
+from torch.export import dynamic_dim, export
+
+def fn(x, y):
+    z = x.clone()
+    z.copy_(y)
+    return z
+
+inp1 = torch.randn(10, 10)
+inp2 = torch.randn(1, 10)
+constraints = (
+    [dynamic_dim(inp1, i) for i in range(inp1.dim())] +
+    [dynamic_dim(inp2, i) for i in range(inp2.dim())]
+)
+exp1 = export(fn, (inp1, inp2))
+# exp1 = export(fn, (inp1, inp2), constraints=constraints)
+exp1.graph_module.print_readable()
+# exp1(torch.randn(10, 10), torch.randn(10, 10))
+exp2 = export(fn, (torch.randn(10, 10), torch.randn(10, 10)))
+exp2.graph_module.print_readable()
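A note on what this test exercises: without the ``constraints`` argument, ``export`` specializes the graph on the example input shapes, which is why ``exp2`` is re-exported for the ``(10, 10)`` inputs. The sketch below is not from the commit; it shows the resulting shape guard, assuming the same torch 2.1-era ``torch.export`` API that this file imports (``dynamic_dim`` was later replaced by ``torch.export.Dim``).

import torch
from torch.export import export

def fn(x, y):
    z = x.clone()
    z.copy_(y)
    return z

# Exported with example inputs of shapes (10, 10) and (1, 10); the
# resulting program guards on those shapes.
exp = export(fn, (torch.randn(10, 10), torch.randn(1, 10)))

try:
    # A different first-dimension size violates the shape guard.
    exp(torch.randn(8, 10), torch.randn(1, 10))
except Exception as err:
    print("guard failure, as expected:", err)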

intermediate_source/torch_compile_tutorial.py

Lines changed: 6 additions & 2 deletions
@@ -195,11 +195,15 @@ def init_model():
 # GPU compute and the observed speedup may be less significant.
 #
 # You may also see different speedup results depending on the chosen ``mode``
-# argument. Since our model and data are small, we want to reduce overhead as
-# much as possible, and so we chose ``"reduce-overhead"``. For your own models,
+# argument. The ``"reduce-overhead"`` mode uses CUDA graphs to further reduce
+# the overhead of Python. For your own models,
 # you may need to experiment with different modes to maximize speedup. You can
 # read more about modes `here <https://pytorch.org/get-started/pytorch-2.0/#user-experience>`__.
 #
+# You might also notice that the second time we run our model with ``torch.compile`` is significantly
+# slower than the other runs, although it is much faster than the first run. This is because the ``"reduce-overhead"``
+# mode runs a few warm-up iterations for CUDA graphs.
+#
 # For general PyTorch benchmarking, you can try using ``torch.utils.benchmark`` instead of the ``timed``
 # function we defined above. We wrote our own timing function in this tutorial to show
 # ``torch.compile``'s compilation latency.
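To make the new warm-up note concrete: with ``mode="reduce-overhead"``, the first call pays for compilation and the second still pays for CUDA graph recording, so per-run timings typically drop in two steps. The following is a rough sketch (not part of the tutorial) that makes the pattern visible on a CUDA machine.

import time
import torch

def timed(fn):
    # Wall-clock timing; synchronize so queued GPU work is counted.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.time()
    result = fn()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return result, time.time() - start

model = torch.nn.Sequential(
    torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8)
)
x = torch.randn(16, 64)
if torch.cuda.is_available():
    model, x = model.cuda(), x.cuda()

compiled = torch.compile(model, mode="reduce-overhead")
with torch.no_grad():
    for i in range(5):
        _, t = timed(lambda: compiled(x))
        # Expect run 0 to be slowest (compilation), run 1 still slower
        # than the rest (CUDA graph warm-up), and runs 2+ to be fastest.
        print(f"run {i}: {t:.6f}s")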
