
Commit ed3f8a5

torch.compile tutorial update for pt2 stable release
1 parent 1f8c325 commit ed3f8a5

File tree

3 files changed: +11 −514 lines


.jenkins/validate_tutorials_built.py

Lines changed: 0 additions & 1 deletion
@@ -39,7 +39,6 @@
     "recipes/profiler_recipe",
     "recipes/save_load_across_devices",
     "recipes/warmstarting_model_using_parameters_from_a_different_model",
-    "torch_compile_tutorial_",
     "recipes/dynamic_quantization",
     "recipes/saving_and_loading_a_general_checkpoint",
     "recipes/benchmark",

intermediate_source/torch_compile_tutorial_.py renamed to intermediate_source/torch_compile_tutorial.py

Lines changed: 11 additions & 7 deletions
@@ -28,7 +28,7 @@
 #
 # **Required pip Dependencies**
 #
-# - ``torch >= 1.14``
+# - ``torch >= 2.0``
 # - ``torchvision``
 # - ``numpy``
 # - ``scipy``
@@ -52,9 +52,6 @@
 
 import torch
 
-import torch._inductor.config
-torch._inductor.config.cpp.cxx = ("g++",)
-
 def foo(x, y):
     a = torch.sin(x)
     b = torch.cos(x)
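For context outside the diff: with the g++ Inductor workaround removed, compiling ``foo`` needs nothing beyond a stock torch >= 2.0 install. A minimal sketch (the return line is assumed from the surrounding tutorial, not shown in this hunk):

    import torch

    def foo(x, y):
        a = torch.sin(x)
        b = torch.cos(x)
        return a + b

    opt_foo = torch.compile(foo)
    print(opt_foo(torch.randn(10, 10), torch.randn(10, 10)))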
@@ -133,6 +130,11 @@ def evaluate(mod, inp):
     return mod(inp)
 
 model = init_model()
+
+# Reset since we are using a different mode.
+import torch._dynamo
+torch._dynamo.reset()
+
 evaluate_opt = torch.compile(evaluate, mode="reduce-overhead")
 
 inp = generate_data(16)[0]
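A minimal sketch of why the reset is inserted here: Dynamo caches compiled artifacts per function, so recompiling the same code under a different mode should be preceded by ``torch._dynamo.reset()``. The ``step`` function below is illustrative, not from the tutorial:

    import torch
    import torch._dynamo

    def step(x):
        return torch.relu(x) * 2

    compiled_default = torch.compile(step)  # default mode
    compiled_default(torch.randn(8))

    torch._dynamo.reset()  # clear cached compilation state before changing mode
    compiled_overhead = torch.compile(step, mode="reduce-overhead")
    compiled_overhead(torch.randn(8))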
@@ -175,7 +177,7 @@ def evaluate(mod, inp):
 ######################################################################
 # And indeed, we can see that running our model with ``torch.compile``
 # results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-# 2.3x speedup. Speedup mainly comes from reducing Python overhead and
+# ~1.5x speedup. Speedup mainly comes from reducing Python overhead and
 # GPU read/writes, and so the observed speedup may vary on factors such as model
 # architecture and batch size. For example, if a model's architecture is simple
 # and the amount of data is large, then the bottleneck would be
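Since the quoted speedup depends on hardware, model, and batch size, one way to check the claim locally is to time eager vs. compiled execution. A hedged sketch; the ``timed`` helper and the ``nn.Linear`` stand-in are illustrative, not the tutorial's ``init_model()``:

    import time
    import torch

    def timed(fn, *args):
        # Synchronize around the call so GPU kernels count toward wall time.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.perf_counter()
        result = fn(*args)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return result, time.perf_counter() - start

    mod = torch.nn.Linear(1024, 1024).eval()
    inp = torch.randn(64, 1024)

    compiled = torch.compile(mod)
    compiled(inp)  # warm-up: the first call pays the compilation cost

    _, eager_time = timed(mod, inp)
    _, compiled_time = timed(compiled, inp)
    print(f"eager: {eager_time:.5f}s, compiled: {compiled_time:.5f}s")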
@@ -233,7 +235,7 @@ def train(mod, data):
 # Again, we can see that ``torch.compile`` takes longer in the first
 # iteration, as it must compile the model, but afterward, we see
 # significant speedups compared to eager. On an NVIDIA A100 GPU, we
-# observe a 2.2x speedup.
+# observe a ~1.8x speedup.
 
 ######################################################################
 # Comparison to TorchScript and FX Tracing
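A hedged sketch of the training-side comparison the prose describes; the model, optimizer, and loss below are stand-ins rather than the tutorial's actual ``train`` setup. Compiling the whole step captures the forward pass, loss, and backward pass in one compiled region:

    import torch

    mod = torch.nn.Linear(16, 1)
    opt = torch.optim.SGD(mod.parameters(), lr=0.01)

    def train(mod, data):
        opt.zero_grad(set_to_none=True)
        pred = mod(data[0])
        loss = torch.nn.functional.mse_loss(pred, data[1])
        loss.backward()
        opt.step()
        return loss

    train_opt = torch.compile(train)
    data = (torch.randn(32, 16), torch.randn(32, 1))
    print(train_opt(mod, data).item())  # first call compiles; later calls are fast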
@@ -297,6 +299,9 @@ def test_fns(fn1, fn2, args):
 # Now we can see that ``torch.compile`` correctly handles
 # data-dependent control flow.
 
+# Reset since we are using a different mode.
+torch._dynamo.reset()
+
 compile_f1 = torch.compile(f1)
 print("compile 1, 1:", test_fns(f1, compile_f1, (inp1, inp2)))
 print("compile 1, 2:", test_fns(f1, compile_f1, (-inp1, inp2)))
@@ -394,7 +399,6 @@ def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor])
     gm.graph.print_tabular()
     return gm.forward
 
-import torch._dynamo
 # Reset since we are using a different backend.
 torch._dynamo.reset()
 
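To make the hunk above self-contained, here is a sketch of how a debugging backend like this one plugs into ``torch.compile``; the function being compiled is illustrative:

    from typing import List
    import torch
    import torch._dynamo

    def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
        print("custom backend called with FX graph:")
        gm.graph.print_tabular()
        return gm.forward  # run the captured graph unmodified

    torch._dynamo.reset()  # reset since we are using a different backend
    opt_fn = torch.compile(lambda x: torch.sin(x) + 1, backend=custom_backend)
    opt_fn(torch.randn(3))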