 #
 # **Required pip Dependencies**
 #
-# - ``torch >= 1.14``
+# - ``torch >= 2.0``
 # - ``torchvision``
 # - ``numpy``
 # - ``scipy``
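
A quick way to confirm these requirements are satisfied locally (a minimal check, not part of the tutorial code):

# Illustrative environment check; torch.compile needs torch 2.0 or later.
import torch, torchvision, numpy, scipy

print("torch:", torch.__version__)        # should report 2.0 or later
print("torchvision:", torchvision.__version__)
print("numpy:", numpy.__version__)
print("scipy:", scipy.__version__)
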
@@ -52,9 +52,6 @@
 
 import torch
 
-import torch._inductor.config
-torch._inductor.config.cpp.cxx = ("g++",)
-
 def foo(x, y):
     a = torch.sin(x)
     b = torch.cos(x)
@@ -133,6 +130,11 @@ def evaluate(mod, inp):
     return mod(inp)
 
 model = init_model()
+
+# Reset since we are using a different mode.
+import torch._dynamo
+torch._dynamo.reset()
+
 evaluate_opt = torch.compile(evaluate, mode="reduce-overhead")
 
 inp = generate_data(16)[0]
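
For context on the ``mode`` argument used in this hunk: ``"reduce-overhead"`` trades a little extra memory for lower per-call overhead, while ``"max-autotune"`` spends longer compiling in search of faster kernels. A small self-contained sketch (the function below is illustrative, not from the tutorial):

# Illustrative sketch of selecting a torch.compile mode (not tutorial code).
import torch

def tiny_fn(x):
    return torch.relu(x) @ x

compiled_default  = torch.compile(tiny_fn)                          # balanced default
compiled_overhead = torch.compile(tiny_fn, mode="reduce-overhead")  # lower per-call overhead
compiled_autotune = torch.compile(tiny_fn, mode="max-autotune")     # longer compile, faster kernels

x = torch.randn(64, 64)
print(compiled_overhead(x).shape)
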
@@ -174,8 +176,7 @@ def evaluate(mod, inp):
 
 ######################################################################
 # And indeed, we can see that running our model with ``torch.compile``
-# results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-# 2.3x speedup. Speedup mainly comes from reducing Python overhead and
+# results in a significant speedup. Speedup mainly comes from reducing Python overhead and
 # GPU read/writes, and so the observed speedup may vary on factors such as model
 # architecture and batch size. For example, if a model's architecture is simple
 # and the amount of data is large, then the bottleneck would be
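
To get a rough feel for this on your own hardware, eager and compiled calls can be timed directly. The following is only a sketch (the tutorial's actual timing helper lies outside this hunk), and the numbers will vary with hardware, model architecture, and batch size, as described above:

# Rough timing sketch (illustrative; not the tutorial's benchmarking code).
import time
import torch

def simple_fn(x):
    return torch.nn.functional.gelu(x @ x).sum()

compiled_fn = torch.compile(simple_fn)
device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(1024, 1024, device=device)

def time_once(fn):
    if device == "cuda":
        torch.cuda.synchronize()
    start = time.perf_counter()
    fn(x)
    if device == "cuda":
        torch.cuda.synchronize()
    return time.perf_counter() - start

time_once(compiled_fn)                      # warm-up: first call includes compilation
print("eager:   ", time_once(simple_fn))
print("compiled:", time_once(compiled_fn))  # steady-state time
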
@@ -231,9 +232,8 @@ def train(mod, data):
 
 ######################################################################
 # Again, we can see that ``torch.compile`` takes longer in the first
-# iteration, as it must compile the model, but afterward, we see
-# significant speedups compared to eager. On an NVIDIA A100 GPU, we
-# observe a 2.2x speedup.
+# iteration, as it must compile the model, but in subsequent iterations, we see
+# significant speedups compared to eager.
 
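
A minimal sketch of this warm-up behavior for a training step (illustrative; this is not the tutorial's ``train`` function): the first compiled iteration pays the compilation cost, and later iterations do not.

# Illustrative warm-up sketch: iteration 0 includes compilation time.
import time
import torch

net = torch.nn.Linear(512, 512)
opt = torch.optim.SGD(net.parameters(), lr=0.01)

def train_step(x):
    opt.zero_grad()
    loss = net(x).sum()
    loss.backward()
    opt.step()

compiled_step = torch.compile(train_step)
x = torch.randn(64, 512)

for i in range(5):
    start = time.perf_counter()
    compiled_step(x)
    print(f"iteration {i}: {time.perf_counter() - start:.4f} s")
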
 ######################################################################
 # Comparison to TorchScript and FX Tracing
@@ -297,6 +297,9 @@ def test_fns(fn1, fn2, args):
 # Now we can see that ``torch.compile`` correctly handles
 # data-dependent control flow.
 
+# Reset since we are using a different mode.
+torch._dynamo.reset()
+
 compile_f1 = torch.compile(f1)
 print("compile 1, 1:", test_fns(f1, compile_f1, (inp1, inp2)))
 print("compile 1, 2:", test_fns(f1, compile_f1, (-inp1, inp2)))
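
The definitions of ``f1`` and ``f2`` fall outside this hunk; purely to illustrate what data-dependent control flow means here, a hypothetical function of that kind might look like this:

# Hypothetical example: the branch taken depends on runtime tensor values,
# which tracing-based approaches cannot capture faithfully.
import torch

def branchy(x, y):
    if x.sum() < 0:   # decided by the data in x
        return -y
    return y

compiled_branchy = torch.compile(branchy)
a, b = torch.randn(10), torch.randn(10)
pos, neg = a.abs(), -a.abs()
print(torch.allclose(compiled_branchy(pos, b), branchy(pos, b)))  # takes the "return y" branch
print(torch.allclose(compiled_branchy(neg, b), branchy(neg, b)))  # takes the "return -y" branch
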
@@ -394,7 +397,6 @@ def custom_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor])
     gm.graph.print_tabular()
     return gm.forward
 
-import torch._dynamo
 # Reset since we are using a different backend.
 torch._dynamo.reset()
 
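
For reference, a graph-printing backend like ``custom_backend`` above is wired in through ``torch.compile``'s ``backend`` argument. A self-contained sketch (the names and the small test function here are illustrative, not the tutorial's):

# Illustrative: a custom backend receives the captured FX graph and
# returns a callable; returning gm.forward just runs the graph eagerly.
from typing import List
import torch

def printing_backend(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
    gm.graph.print_tabular()   # inspect the captured graph
    return gm.forward          # execute it as-is

def fn(x):
    return torch.cos(x) + torch.sin(x)

opt_fn = torch.compile(fn, backend=printing_backend)
opt_fn(torch.randn(4))         # prints the FX graph in tabular form, then runs it
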
@@ -489,4 +491,4 @@ def bar(a, b):
 # In this tutorial, we introduced ``torch.compile`` by covering
 # basic usage, demonstrating speedups over eager mode, comparing to previous
 # PyTorch compiler solutions, and briefly investigating TorchDynamo and its interactions
-# with FX graphs. We hope that you will give ``torch.compile`` a try!
+# with FX graphs. We hope that you will give ``torch.compile`` a try!