@@ -28,7 +28,7 @@
 #
 # **Required pip Dependencies**
 #
-# - ``torch >= 1.14``
+# - ``torch >= 2.0``
 # - ``torchvision``
 # - ``numpy``
 # - ``scipy``
@@ -52,9 +52,6 @@

 import torch

-import torch._inductor.config
-torch._inductor.config.cpp.cxx = ("g++",)
-
 def foo(x, y):
     a = torch.sin(x)
     b = torch.cos(x)
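Note: this hunk drops the explicit Inductor C++ compiler override, which ``torch >= 2.0`` no longer needs, and the diff context cuts ``foo`` off mid-body. A minimal runnable sketch of this first ``torch.compile`` example, assuming the truncated function simply returns ``a + b``:

    import torch

    def foo(x, y):
        a = torch.sin(x)
        b = torch.cos(x)
        return a + b  # assumed continuation; the hunk truncates before this line

    # No manual compiler configuration is required anymore.
    opt_foo = torch.compile(foo)
    print(opt_foo(torch.randn(10), torch.randn(10)))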
@@ -133,6 +130,11 @@ def evaluate(mod, inp):
     return mod(inp)

 model = init_model()
+
+# Reset since we are using a different mode.
+import torch._dynamo
+torch._dynamo.reset()
+
 evaluate_opt = torch.compile(evaluate, mode="reduce-overhead")

 inp = generate_data(16)[0]
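Note: the reset added here matters because Dynamo caches compiled artifacts per function, so recompiling with a different ``mode`` without a reset may reuse the earlier compilation. A minimal sketch of the pattern, assuming ``evaluate``, ``model``, and ``inp`` from earlier in the tutorial:

    import torch
    import torch._dynamo

    # First compilation, default mode.
    evaluate_opt = torch.compile(evaluate)
    evaluate_opt(model, inp)

    # Clear Dynamo's caches so the new mode actually takes effect,
    # then recompile with reduced per-call overhead.
    torch._dynamo.reset()
    evaluate_opt = torch.compile(evaluate, mode="reduce-overhead")
    evaluate_opt(model, inp)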
@@ -175,7 +177,7 @@ def evaluate(mod, inp):
 ######################################################################
 # And indeed, we can see that running our model with ``torch.compile``
 # results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-# 2.3x speedup. Speedup mainly comes from reducing Python overhead and
+# ~1.5x speedup. Speedup mainly comes from reducing Python overhead and
 # GPU read/writes, and so the observed speedup may vary on factors such as model
 # architecture and batch size. For example, if a model's architecture is simple
 # and the amount of data is large, then the bottleneck would be
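Note: the revised ~1.5x figure is a wall-clock measurement. A sketch of how such GPU timings are typically taken (the tutorial's actual timing helper may differ): CUDA kernels launch asynchronously, so the call is bracketed with events and synchronized before reading the elapsed time.

    import torch

    def timed(fn):
        # Use CUDA events so the measurement covers kernel execution,
        # not just the (asynchronous) Python-side launch.
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        result = fn()
        end.record()
        torch.cuda.synchronize()
        return result, start.elapsed_time(end) / 1000  # seconds

    _, eager_t = timed(lambda: evaluate(model, inp))
    _, compiled_t = timed(lambda: evaluate_opt(model, inp))
    print(f"speedup: {eager_t / compiled_t:.2f}x")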
@@ -233,7 +235,7 @@ def train(mod, data):
 # Again, we can see that ``torch.compile`` takes longer in the first
 # iteration, as it must compile the model, but afterward, we see
 # significant speedups compared to eager. On an NVIDIA A100 GPU, we
-# observe a 2.2x speedup.
+# observe a ~1.8x speedup.

 ######################################################################
 # Comparison to TorchScript and FX Tracing
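Note: the training number compiles the whole train step, backward pass included. A hedged sketch matching the ``def train(mod, data)`` shape in the hunk header (the optimizer and loss shown are assumptions; the tutorial defines its own):

    import torch

    model = init_model()  # defined earlier in the tutorial
    opt = torch.optim.Adam(model.parameters())

    def train(mod, data):
        opt.zero_grad(True)
        pred = mod(data[0])
        loss = torch.nn.CrossEntropyLoss()(pred, data[1])
        loss.backward()
        opt.step()

    # Compiling the full step lets torch.compile optimize backward too.
    train_opt = torch.compile(train, mode="reduce-overhead")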
@@ -297,6 +299,9 @@ def test_fns(fn1, fn2, args):
 # Now we can see that ``torch.compile`` correctly handles
 # data-dependent control flow.

+# Reset since we are using a different mode.
+torch._dynamo.reset()
+
 compile_f1 = torch.compile(f1)
 print("compile 1, 1:", test_fns(f1, compile_f1, (inp1, inp2)))
 print("compile 1, 2:", test_fns(f1, compile_f1, (-inp1, inp2)))
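Note: ``f1`` is defined earlier in the tutorial; a hypothetical function of the same flavor shows the data-dependent branch that single-graph tracing cannot capture but ``torch.compile`` handles by guarding and recompiling:

    import torch

    def f1(x, y):
        # The branch condition depends on runtime tensor values.
        if y.sum() < 0:
            y = -y
        return x * y

    compile_f1 = torch.compile(f1)
    inp1, inp2 = torch.randn(5, 5), torch.randn(5, 5)
    assert torch.allclose(compile_f1(inp1, inp2), f1(inp1, inp2))
    assert torch.allclose(compile_f1(inp1, -inp2), f1(inp1, -inp2))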
@@ -394,7 +399,6 @@ def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor])
     gm.graph.print_tabular()
     return gm.forward

-import torch._dynamo
 # Reset since we are using a different backend.
 torch._dynamo.reset()

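Note: with the stray second import gone, the custom backend in this hunk can be exercised end to end. A sketch (the wrapped function is an arbitrary example, not the tutorial's):

    from typing import List
    import torch
    import torch._dynamo

    def custom_backend(gm: torch.fx.GraphModule,
                       example_inputs: List[torch.Tensor]):
        # Print the FX graph Dynamo captured, then fall back to eager.
        print("custom backend called with FX graph:")
        gm.graph.print_tabular()
        return gm.forward

    torch._dynamo.reset()
    opt_fn = torch.compile(lambda x: torch.cos(x).relu(), backend=custom_backend)
    opt_fn(torch.randn(10))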