From 82544b41a07b26ccf559c94c0d4b393150732c78 Mon Sep 17 00:00:00 2001
From: William Wen
Date: Mon, 27 Feb 2023 23:35:52 +0000
Subject: [PATCH 1/3] torch.compile tutorial update for pt2 stable release

---
 .jenkins/validate_tutorials_built.py | 1 -
 ...tutorial_.py => torch_compile_tutorial.py} | 18 +-
 .../torch_compile_tutorial.rst | 506 ------------------
 3 files changed, 11 insertions(+), 514 deletions(-)
 rename intermediate_source/{torch_compile_tutorial_.py => torch_compile_tutorial.py} (98%)
 delete mode 100644 intermediate_source/torch_compile_tutorial.rst

diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py
index bedb92252e9..94f1c53051d 100644
--- a/.jenkins/validate_tutorials_built.py
+++ b/.jenkins/validate_tutorials_built.py
@@ -38,7 +38,6 @@
     "recipes/profiler_recipe",
     "recipes/save_load_across_devices",
     "recipes/warmstarting_model_using_parameters_from_a_different_model",
-    "torch_compile_tutorial_",
     "recipes/dynamic_quantization",
     "recipes/saving_and_loading_a_general_checkpoint",
     "recipes/benchmark",
diff --git a/intermediate_source/torch_compile_tutorial_.py b/intermediate_source/torch_compile_tutorial.py
similarity index 98%
rename from intermediate_source/torch_compile_tutorial_.py
rename to intermediate_source/torch_compile_tutorial.py
index 3b7c6884a2a..5ff863f01d5 100644
--- a/intermediate_source/torch_compile_tutorial_.py
+++ b/intermediate_source/torch_compile_tutorial.py
@@ -28,7 +28,7 @@
 #
 # **Required pip Dependencies**
 #
-# - ``torch >= 1.14``
+# - ``torch >= 2.0``
 # - ``torchvision``
 # - ``numpy``
 # - ``scipy``
@@ -52,9 +52,6 @@
 
 import torch
 
-import torch._inductor.config
-torch._inductor.config.cpp.cxx = ("g++",)
-
 def foo(x, y):
     a = torch.sin(x)
     b = torch.cos(x)
@@ -133,6 +130,11 @@ def evaluate(mod, inp):
     return mod(inp)
 
 model = init_model()
+
+# Reset since we are using a different mode.
+import torch._dynamo
+torch._dynamo.reset()
+
 evaluate_opt = torch.compile(evaluate, mode="reduce-overhead")
 
 inp = generate_data(16)[0]
@@ -175,7 +177,7 @@ def evaluate(mod, inp):
 ######################################################################
 # And indeed, we can see that running our model with ``torch.compile``
 # results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-# 2.3x speedup. Speedup mainly comes from reducing Python overhead and
+# ~1.5x speedup. Speedup mainly comes from reducing Python overhead and
 # GPU read/writes, and so the observed speedup may vary depending on factors such as model
 # architecture and batch size. For example, if a model's architecture is simple
 # and the amount of data is large, then the bottleneck would be
@@ -233,7 +235,7 @@ def train(mod, data):
 ######################################################################
 # Again, we can see that ``torch.compile`` takes longer in the first
 # iteration, as it must compile the model, but afterward, we see
 # significant speedups compared to eager. On an NVIDIA A100 GPU, we
-# observe a 2.2x speedup.
+# observe a ~1.8x speedup.
 ######################################################################
 # Comparison to TorchScript and FX Tracing
@@ -297,6 +299,9 @@ def test_fns(fn1, fn2, args):
 # Now we can see that ``torch.compile`` correctly handles
 # data-dependent control flow.
 
+# Reset since we are using a different mode.
+torch._dynamo.reset() + compile_f1 = torch.compile(f1) print("compile 1, 1:", test_fns(f1, compile_f1, (inp1, inp2))) print("compile 1, 2:", test_fns(f1, compile_f1, (-inp1, inp2))) @@ -394,7 +399,6 @@ def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]) gm.graph.print_tabular() return gm.forward -import torch._dynamo # Reset since we are using a different backend. torch._dynamo.reset() diff --git a/intermediate_source/torch_compile_tutorial.rst b/intermediate_source/torch_compile_tutorial.rst deleted file mode 100644 index c8b1c6567bc..00000000000 --- a/intermediate_source/torch_compile_tutorial.rst +++ /dev/null @@ -1,506 +0,0 @@ -torch.compile Tutorial -====================== -**Author:** William Wen - -``torch.compile`` is the latest method to speed up your PyTorch code! -``torch.compile`` makes PyTorch code run faster by -JIT-compiling PyTorch code into optimized kernels, -all while requiring minimal code changes. - -In this tutorial, we cover basic ``torch.compile`` usage, -and demonstrate the advantages of ``torch.compile`` over -previous PyTorch compiler solutions, such as -`TorchScript `__ and -`FX Tracing `__. - -**Contents** - -- Basic Usage -- Demonstrating Speedups -- Comparison to TorchScript and FX Tracing -- TorchDynamo and FX Graphs -- Conclusion - -**Required pip Dependencies** - -- ``torch >= 1.14`` -- ``torchvision`` -- ``numpy`` -- ``scipy`` -- ``tabulate`` - -Note: a modern NVIDIA GPU (Volta or Ampere) is recommended for this tutorial. - -Basic Usage ------------- - -``torch.compile`` is included in the latest PyTorch nightlies. -Running TorchInductor on GPU requires Triton, which is included with the PyTorch 2.0 nightly -binary. If Triton is still missing, try installing ``torchtriton`` via pip -(``pip install torchtriton --extra-index-url "https://download.pytorch.org/whl/nightly/cu117"`` -for CUDA 11.7). - -Arbitrary Python functions can be optimized by passing the callable to -``torch.compile``. We can then call the returned optimized -function in place of the original function. - -.. code-block:: python - - import torch - - def foo(x, y): - a = torch.sin(x) - b = torch.cos(x) - return a + b - opt_foo1 = torch.compile(foo) - print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10))) - -Alternatively, we can decorate the function. - -.. code-block:: python - - @torch.compile - def opt_foo2(x, y): - a = torch.sin(x) - b = torch.cos(x) - return a + b - print(opt_foo2(torch.randn(10, 10), torch.randn(10, 10))) - -We can also optimize ``torch.nn.Module`` instances. - -.. code-block:: python - - class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x): - return torch.nn.functional.relu(self.lin(x)) - - mod = MyModule() - opt_mod = torch.compile(mod) - print(opt_mod(torch.randn(10, 100))) - -Demonstrating Speedups ------------------------ - -Let's now demonstrate that using ``torch.compile`` can speed -up real models. We will compare standard eager mode and -``torch.compile`` by evaluating and training ResNet-18 on random data. - -Before we start, we need to define some utility functions. - -.. code-block:: python - - # Returns the result of running `fn()` and the time it took for `fn()` to run, - # in seconds. We use CUDA events and synchronization for the most accurate - # measurements. 
-    def timed(fn):
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-        start.record()
-        result = fn()
-        end.record()
-        torch.cuda.synchronize()
-        return result, start.elapsed_time(end) / 1000
-
-    # Generates random input and targets data for the model, where `b` is
-    # batch size.
-    def generate_data(b):
-        return (
-            torch.randn(b, 3, 128, 128).to(torch.float32).cuda(),
-            torch.randint(1000, (b,)).cuda(),
-        )
-
-    N_ITERS = 10
-
-    from torchvision.models import resnet18
-    def init_model():
-        return resnet18().to(torch.float32).cuda()
-
-First, let's compare inference.
-
-Note that in the call to ``torch.compile``, we have the additional
-``mode`` kwarg, which we will discuss below.
-
-.. code-block:: python
-
-    def evaluate(mod, inp):
-        return mod(inp)
-
-    model = init_model()
-    evaluate_opt = torch.compile(evaluate, mode="reduce-overhead")
-
-    inp = generate_data(16)[0]
-    print("eager:", timed(lambda: evaluate(model, inp))[1])
-    print("compile:", timed(lambda: evaluate_opt(model, inp))[1])
-
-Notice that ``torch.compile`` takes a lot longer to complete
-compared to eager. This is because ``torch.compile`` compiles
-the model into optimized kernels as it executes. In our example, the
-structure of the model doesn't change, and so recompilation is not
-needed. So if we run our optimized model several more times, we should
-see a significant improvement compared to eager.
-
-.. code-block:: python
-
-    eager_times = []
-    compile_times = []
-    for i in range(N_ITERS):
-        inp = generate_data(16)[0]
-        _, eager_time = timed(lambda: evaluate(model, inp))
-        eager_times.append(eager_time)
-        print(f"eager eval time {i}: {eager_time}")
-
-    print("~" * 10)
-
-    compile_times = []
-    for i in range(N_ITERS):
-        inp = generate_data(16)[0]
-        _, compile_time = timed(lambda: evaluate_opt(model, inp))
-        compile_times.append(compile_time)
-        print(f"compile eval time {i}: {compile_time}")
-    print("~" * 10)
-
-    import numpy as np
-    eager_med = np.median(eager_times)
-    compile_med = np.median(compile_times)
-    speedup = eager_med / compile_med
-    print(f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x")
-    print("~" * 10)
-
-And indeed, we can see that running our model with ``torch.compile``
-results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-2.3x speedup. Speedup mainly comes from reducing Python overhead and
-GPU read/writes, and so the observed speedup may vary depending on factors such as model
-architecture and batch size. For example, if a model's architecture is simple
-and the amount of data is large, then the bottleneck would be
-GPU compute and the observed speedup may be less significant.
-
-You may also see different speedup results depending on the chosen ``mode``
-kwarg. Since our model and data are small, we want to reduce overhead as
-much as possible, and so we chose ``"reduce-overhead"``. For your own models,
-you may need to experiment with different modes to maximize speedup. You can
-read more about modes `here `__.
-
-For general PyTorch benchmarking, you can try using ``torch.utils.benchmark`` instead of the ``timed``
-function we defined above. We wrote our own timing function in this tutorial to show
-``torch.compile``'s compilation latency.
-
-Now, let's consider comparing training.
-
-.. 
code-block:: python - - model = init_model() - opt = torch.optim.Adam(model.parameters()) - - def train(mod, data): - opt.zero_grad(True) - pred = mod(data[0]) - loss = torch.nn.CrossEntropyLoss()(pred, data[1]) - loss.backward() - opt.step() - - eager_times = [] - for i in range(N_ITERS): - inp = generate_data(16) - _, eager_time = timed(lambda: train(model, inp)) - eager_times.append(eager_time) - print(f"eager train time {i}: {eager_time}") - print("~" * 10) - - model = init_model() - opt = torch.optim.Adam(model.parameters()) - train_opt = torch.compile(train, mode="reduce-overhead") - - compile_times = [] - for i in range(N_ITERS): - inp = generate_data(16) - _, compile_time = timed(lambda: train_opt(model, inp)) - compile_times.append(compile_time) - print(f"compile train time {i}: {compile_time}") - print("~" * 10) - - eager_med = np.median(eager_times) - compile_med = np.median(compile_times) - speedup = eager_med / compile_med - print(f"(train) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x") - print("~" * 10) - -Again, we can see that ``torch.compile`` takes longer in the first -iteration, as it must compile the model, but afterward, we see -significant speedups compared to eager. On an NVIDIA A100 GPU, we -observe a 2.2x speedup. - -Comparison to TorchScript and FX Tracing ------------------------------------------ - -We have seen that ``torch.compile`` can speed up PyTorch code. -Why else should we use ``torch.compile`` over existing PyTorch -compiler solutions, such as TorchScript or FX Tracing? Primarily, the -advantage of ``torch.compile`` lies in its ability to handle -arbitrary Python code with minimal changes to existing code. - -One case that ``torch.compile`` can handle that other compiler -solutions struggle with is data-dependent control flow (the -``if x.sum() < 0:`` line below). - -.. code-block:: python - - def f1(x, y): - if x.sum() < 0: - return -y - return y - - # Test that `fn1` and `fn2` return the same result, given - # the same arguments `args`. Typically, `fn1` will be an eager function - # while `fn2` will be a compiled function (torch.compile, TorchScript, or FX graph). - def test_fns(fn1, fn2, args): - out1 = fn1(*args) - out2 = fn2(*args) - return torch.allclose(out1, out2) - - inp1 = torch.randn(5, 5) - inp2 = torch.randn(5, 5) - -TorchScript tracing ``f1`` results in -silently incorrect results, since only the actual control flow path -is traced. - -.. code-block:: python - - traced_f1 = torch.jit.trace(f1, (inp1, inp2)) - print("traced 1, 1:", test_fns(f1, traced_f1, (inp1, inp2))) - print("traced 1, 2:", test_fns(f1, traced_f1, (-inp1, inp2))) - -FX tracing ``f1`` results in an error due to the presence of -data-dependent control flow. - -.. code-block:: python - - import traceback as tb - try: - torch.fx.symbolic_trace(f1) - except: - tb.print_exc() - -If we provide a value for ``x`` as we try to FX trace ``f1``, then -we run into the same problem as TorchScript tracing, as the data-dependent -control flow is removed in the traced function. - -.. code-block:: python - - fx_f1 = torch.fx.symbolic_trace(f1, concrete_args={"x": inp1}) - print("fx 1, 1:", test_fns(f1, fx_f1, (inp1, inp2))) - print("fx 1, 2:", test_fns(f1, fx_f1, (-inp1, inp2))) - -Now we can see that ``torch.compile`` correctly handles -data-dependent control flow. - -.. 
code-block:: python
-
-    compile_f1 = torch.compile(f1)
-    print("compile 1, 1:", test_fns(f1, compile_f1, (inp1, inp2)))
-    print("compile 1, 2:", test_fns(f1, compile_f1, (-inp1, inp2)))
-    print("~" * 10)
-
-TorchScript scripting can handle data-dependent control flow, but this
-solution comes with its own set of problems. Namely, TorchScript scripting
-can require major code changes and will raise errors when unsupported Python
-is used.
-
-In the example below, we forget TorchScript type annotations and we receive
-a TorchScript error because the input type for argument ``y``, an ``int``,
-does not match the default argument type, ``torch.Tensor``.
-
-.. code-block:: python
-
-    def f2(x, y):
-        return x + y
-
-    inp1 = torch.randn(5, 5)
-    inp2 = 3
-
-    script_f2 = torch.jit.script(f2)
-    try:
-        script_f2(inp1, inp2)
-    except:
-        tb.print_exc()
-
-However, ``torch.compile`` is easily able to handle ``f2``.
-
-.. code-block:: python
-
-    compile_f2 = torch.compile(f2)
-    print("compile 2:", test_fns(f2, compile_f2, (inp1, inp2)))
-    print("~" * 10)
-
-Another case that ``torch.compile`` handles well compared to
-previous compiler solutions is the usage of non-PyTorch functions.
-
-.. code-block:: python
-
-    import scipy
-    def f3(x):
-        x = x * 2
-        x = scipy.fft.dct(x.numpy())
-        x = torch.from_numpy(x)
-        x = x * 2
-        return x
-
-TorchScript tracing treats results from non-PyTorch function calls
-as constants, and so our results can be silently wrong.
-
-.. code-block:: python
-
-    inp1 = torch.randn(5, 5)
-    inp2 = torch.randn(5, 5)
-    traced_f3 = torch.jit.trace(f3, (inp1,))
-    print("traced 3:", test_fns(f3, traced_f3, (inp2,)))
-
-TorchScript scripting and FX tracing disallow non-PyTorch function calls.
-
-.. code-block:: python
-
-    try:
-        torch.jit.script(f3)
-    except:
-        tb.print_exc()
-
-    try:
-        torch.fx.symbolic_trace(f3)
-    except:
-        tb.print_exc()
-
-In comparison, ``torch.compile`` is easily able to handle
-the non-PyTorch function call.
-
-.. code-block:: python
-
-    compile_f3 = torch.compile(f3)
-    print("compile 3:", test_fns(f3, compile_f3, (inp2,)))
-
-TorchDynamo and FX Graphs
--------------------------
-
-One important component of ``torch.compile`` is TorchDynamo.
-TorchDynamo is responsible for JIT compiling arbitrary Python code into
-`FX graphs `__, which can
-then be further optimized. TorchDynamo extracts FX graphs by analyzing Python bytecode
-during runtime and detecting calls to PyTorch operations.
-
-Normally, TorchInductor, another component of ``torch.compile``,
-further compiles the FX graphs into optimized kernels,
-but TorchDynamo allows for different backends to be used. In order to inspect
-the FX graphs that TorchDynamo outputs, let us create a custom backend that
-outputs the FX graph and simply returns the graph's unoptimized forward method.
-
-.. code-block:: python
-
-    from typing import List
-    def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-        print("custom backend called with FX graph:")
-        gm.graph.print_tabular()
-        return gm.forward
-
-    import torch._dynamo
-    # Reset since we are using a different backend.
-    torch._dynamo.reset()
-
-    opt_model = torch.compile(init_model(), backend=custom_backend)
-    opt_model(generate_data(16)[0])
-
-Using our custom backend, we can now see how TorchDynamo is able to handle
-data-dependent control flow. Consider the function below, where the line
-``if b.sum() < 0`` is the source of data-dependent control flow.
-
-.. 
code-block:: python
-
-    def bar(a, b):
-        x = a / (torch.abs(a) + 1)
-        if b.sum() < 0:
-            b = b * -1
-        return x * b
-
-    opt_bar = torch.compile(bar, backend=custom_backend)
-    inp1 = torch.randn(10)
-    inp2 = torch.randn(10)
-    opt_bar(inp1, inp2)
-    opt_bar(inp1, -inp2)
-
-The output reveals that TorchDynamo extracted 3 different FX graphs
-corresponding to the following code (order may differ from the output above):
-
-1. ``x = a / (torch.abs(a) + 1)``
-2. ``b = b * -1; return x * b``
-3. ``return x * b``
-
-When TorchDynamo encounters unsupported Python features, such as data-dependent
-control flow, it breaks the computation graph, lets the default Python
-interpreter handle the unsupported code, then resumes capturing the graph.
-
-Let's investigate by example how TorchDynamo would step through ``bar``.
-If ``b.sum() < 0``, then TorchDynamo would run graph 1, let
-Python determine the result of the conditional, then run
-graph 2. On the other hand, if ``not b.sum() < 0``, then TorchDynamo
-would run graph 1, let Python determine the result of the conditional, then
-run graph 3.
-
-This highlights a major difference between TorchDynamo and previous PyTorch
-compiler solutions. When encountering unsupported Python features,
-previous solutions either raise an error or silently fail.
-TorchDynamo, on the other hand, will break the computation graph.
-
-We can see where TorchDynamo breaks the graph by using ``torch._dynamo.explain``:
-
-.. code-block:: python
-
-    # Reset since we are using a different backend.
-    torch._dynamo.reset()
-    explanation, out_guards, graphs, ops_per_graph, break_reasons, explanation_verbose = torch._dynamo.explain(
-        bar, torch.randn(10), torch.randn(10)
-    )
-    print(explanation_verbose)
-
-In order to maximize speedup, graph breaks should be limited.
-We can force TorchDynamo to raise an error upon the first graph
-break encountered by using ``fullgraph=True``:
-
-.. code-block:: python
-
-    opt_bar = torch.compile(bar, fullgraph=True)
-    try:
-        opt_bar(torch.randn(10), torch.randn(10))
-    except:
-        tb.print_exc()
-
-And below, we demonstrate that TorchDynamo does not break the graph on
-the model we used above for demonstrating speedups.
-
-.. code-block:: python
-
-    opt_model = torch.compile(init_model(), fullgraph=True)
-    print(opt_model(generate_data(16)[0]))
-
-Finally, if we simply want TorchDynamo to output the FX graph for export,
-we can use ``torch._dynamo.export``. Note that ``torch._dynamo.export``, like
-``fullgraph=True``, raises an error if TorchDynamo breaks the graph.
-
-.. code-block:: python
-
-    try:
-        torch._dynamo.export(bar, torch.randn(10), torch.randn(10))
-    except:
-        tb.print_exc()
-
-    model_exp = torch._dynamo.export(init_model(), generate_data(16)[0])
-    print(model_exp[0](generate_data(16)[0]))
-
-Conclusion
------------
-
-In this tutorial, we introduced ``torch.compile`` by covering
-basic usage, demonstrating speedups over eager mode, comparing to previous
-PyTorch compiler solutions, and briefly investigating TorchDynamo and its interactions
-with FX graphs. We hope that you will give ``torch.compile`` a try!
\ No newline at end of file

From 060c1dd694d9b326ca6f3d85a333bb6a3f28d7c2 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu
Date: Wed, 1 Mar 2023 09:45:55 -0800
Subject: [PATCH 2/3] Update torch_compile_tutorial.py

---
 intermediate_source/torch_compile_tutorial.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
index 5ff863f01d5..c24feb98cea 100644
--- a/intermediate_source/torch_compile_tutorial.py
+++ b/intermediate_source/torch_compile_tutorial.py
@@ -233,7 +233,7 @@ def train(mod, data):
 
 ######################################################################
 # Again, we can see that ``torch.compile`` takes longer in the first
-# iteration, as it must compile the model, but afterward, we see
+# iteration, as it must compile the model, but in subsequent iterations, we see
 # significant speedups compared to eager. On an NVIDIA A100 GPU, we
 # observe a ~1.8x speedup.
 
@@ -493,4 +493,4 @@ def bar(a, b):
 # In this tutorial, we introduced ``torch.compile`` by covering
 # basic usage, demonstrating speedups over eager mode, comparing to previous
 # PyTorch compiler solutions, and briefly investigating TorchDynamo and its interactions
-# with FX graphs. We hope that you will give ``torch.compile`` a try!
\ No newline at end of file
+# with FX graphs. We hope that you will give ``torch.compile`` a try!

From 3a245e99d10259bbbc53d4e3aeee59684482d91b Mon Sep 17 00:00:00 2001
From: William Wen
Date: Mon, 6 Mar 2023 19:04:39 +0000
Subject: [PATCH 3/3] remove speedup numbers

---
 intermediate_source/torch_compile_tutorial.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py
index c24feb98cea..629b2ab6b41 100644
--- a/intermediate_source/torch_compile_tutorial.py
+++ b/intermediate_source/torch_compile_tutorial.py
@@ -176,8 +176,7 @@ def evaluate(mod, inp):
 
 ######################################################################
 # And indeed, we can see that running our model with ``torch.compile``
-# results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-# ~1.5x speedup. Speedup mainly comes from reducing Python overhead and
+# results in a significant speedup. Speedup mainly comes from reducing Python overhead and
 # GPU read/writes, and so the observed speedup may vary depending on factors such as model
 # architecture and batch size. For example, if a model's architecture is simple
 # and the amount of data is large, then the bottleneck would be
@@ -234,8 +233,7 @@ def train(mod, data):
 ######################################################################
 # Again, we can see that ``torch.compile`` takes longer in the first
 # iteration, as it must compile the model, but in subsequent iterations, we see
-# significant speedups compared to eager. On an NVIDIA A100 GPU, we
-# observe a ~1.8x speedup.
+# significant speedups compared to eager.
 
 ######################################################################
 # Comparison to TorchScript and FX Tracing