diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
index 0ef64cdbc51..5de08bc4dda 100644
--- a/intermediate_source/torch_compile_tutorial.py
+++ b/intermediate_source/torch_compile_tutorial.py
@@ -195,11 +195,15 @@ def init_model():
 # GPU compute and the observed speedup may be less significant.
 #
 # You may also see different speedup results depending on the chosen ``mode``
-# argument. Since our model and data are small, we want to reduce overhead as
-# much as possible, and so we chose ``"reduce-overhead"``. For your own models,
+# argument. The ``"reduce-overhead"`` mode uses CUDA graphs to further reduce
+# Python overhead. For your own models,
 # you may need to experiment with different modes to maximize speedup. You can
 # read more about modes `here `__.
 #
+# You might also notice that the second run of our model with ``torch.compile`` is significantly
+# slower than the subsequent runs, although it is much faster than the first run. This is because the ``"reduce-overhead"``
+# mode runs a few warm-up iterations for CUDA graphs.
+#
 # For general PyTorch benchmarking, you can try using ``torch.utils.benchmark`` instead of the ``timed``
 # function we defined above. We wrote our own timing function in this tutorial to show
 # ``torch.compile``'s compilation latency.
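
For context on the warm-up behavior the new paragraph describes, here is a minimal sketch of the ``mode="reduce-overhead"`` pattern. The model, shapes, and timing loop are hypothetical stand-ins (not the tutorial's actual ``init_model`` or ``timed`` helpers); it assumes a CUDA-capable machine:

```python
import time
import torch

# hypothetical stand-in for the tutorial's init_model()
model = torch.nn.Sequential(
    torch.nn.Linear(1024, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 1024),
).cuda()

# "reduce-overhead" uses CUDA graphs to cut per-call Python overhead,
# at the cost of a few warm-up iterations after compilation
compiled = torch.compile(model, mode="reduce-overhead")

x = torch.randn(16, 1024, device="cuda")
for i in range(5):
    torch.cuda.synchronize()
    start = time.perf_counter()
    compiled(x)
    torch.cuda.synchronize()
    # run 0 pays compilation cost; run 1 may still be slow while
    # CUDA graphs warm up; later runs show the steady state
    print(f"run {i}: {time.perf_counter() - start:.4f}s")
```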
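And a sketch of the ``torch.utils.benchmark`` alternative the last paragraph mentions, again with a hypothetical toy model; ``Timer`` handles CUDA synchronization and warm-up, so it measures steady-state latency rather than compilation latency:

```python
import torch
import torch.utils.benchmark as benchmark

model = torch.nn.Linear(1024, 1024).cuda()  # hypothetical toy model
compiled = torch.compile(model)
x = torch.randn(16, 1024, device="cuda")

compiled(x)  # trigger compilation first so it is excluded from the measurement

t = benchmark.Timer(
    stmt="compiled(x)",
    globals={"compiled": compiled, "x": x},
)
# timeit(100) runs the statement 100 times and reports the mean per-run time
print(t.timeit(100))
```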