From 0d8c59f0822bffc3b1b3e15d3eeed4e24d2918a0 Mon Sep 17 00:00:00 2001
From: Driss Guessous
Date: Mon, 12 Sep 2022 21:58:28 +0000
Subject: [PATCH 1/2] Update nestedtensor to_padded calls

---
 prototype_source/nestedtensor.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/prototype_source/nestedtensor.py b/prototype_source/nestedtensor.py
index 6ab4891d677..dd7c8cd2c76 100644
--- a/prototype_source/nestedtensor.py
+++ b/prototype_source/nestedtensor.py
@@ -39,7 +39,7 @@
 ######################################################################
 # By padding every underlying tensor to the same shape,
 # a nested tensor can be converted to a regular tensor.
-pt = nt.to_padded_tensor(0.0)
+pt = torch.nested.to_padded_tensor(nt, padding=0.0)
 print(pt)
 
 ######################################################################
@@ -400,9 +400,9 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 value = torch.nested_tensor(values )
 
 # pad input
-padded_query = query.to_padded_tensor(0.0, (N, L_t, E_q))
-padded_key = key .to_padded_tensor(0.0, (N, L_s, E_k))
-padded_value = value.to_padded_tensor(0.0, (N, L_s, E_v))
+padded_query = torch.nested.to_padded_tensor(query, 0.0, (N, L_t, E_q))
+padded_key = torch.nested.to_padded_tensor(key, 0.0, (N, L_s, E_k))
+padded_value = torch.nested.to_padded_tensor(value, 0.0, (N, L_s, E_v))
 
 # create attention masks
 attn_mask_q = torch.zeros((N, L_t), dtype=torch.bool)
@@ -436,7 +436,7 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 dropout_p=dropout_p)
 t2 = timeit.default_timer()
 
-print("nested and padded calculations differ by", (out_nested.to_padded_tensor(0.0, (N, L_t, E_out)) - out_padded).abs().max().item())
+print("nested and padded calculations differ by", (torch.nested.to_padded_tensor(out_nested, 0.0, (N, L_t, E_out)) - out_padded).abs().max().item())
 print("nested tensor multi-head attention takes", t1 - t0, "seconds")
 print("padded tensor multi-head attention takes", t2 - t1, "seconds")
 
@@ -486,7 +486,7 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 dropout_p=dropout_p)
 t3 = timeit.default_timer()
 
-print("nested general and library calculations differ by", (out_nested.to_padded_tensor(0.0) - out_lib.to_padded_tensor(0.0)).abs().max().item())
+print("nested general and library calculations differ by", (torch.nested.to_padded_tensor(out_nested, 0.0) - torch.nested.to_padded_tensor(out_lib, 0.0)).abs().max().item())
 print("nested library multi-head attention takes", t1 - t0, "seconds")
 print("nested general multi-head attention takes", t2 - t1, "seconds")
 print("padded tensor multi-head attention takes", t3 - t2, "seconds")

From e6000d06ea3b7e4cd81b5907d78e59ba3c38a691 Mon Sep 17 00:00:00 2001
From: William Wen
Date: Tue, 20 Dec 2022 16:08:28 +0000
Subject: [PATCH 2/2] add optimizer to torch.compile part

---
 intermediate_source/torch_compile_tutorial.rst | 16 ++++------------
 intermediate_source/torch_compile_tutorial_.py | 16 ++++------------
 2 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/intermediate_source/torch_compile_tutorial.rst b/intermediate_source/torch_compile_tutorial.rst
index e065df6c2cc..c39a4f9c413 100644
--- a/intermediate_source/torch_compile_tutorial.rst
+++ b/intermediate_source/torch_compile_tutorial.rst
@@ -172,7 +172,7 @@ see a significant improvement compared to eager.
 
 And indeed, we can see that running our model with ``torch.compile``
 results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-2.2x speedup. Speedup mainly comes from reducing Python overhead and
+2.3x speedup. Speedup mainly comes from reducing Python overhead and
 GPU read/writes, and so the observed speedup may vary on factors such as model
 architecture and batch size. For example, if a model's architecture is simple
 and the amount of data is large, then the bottleneck would be
@@ -192,16 +192,16 @@ Now, let's consider comparing training.
     opt = torch.optim.Adam(model.parameters())
 
     def train(mod, data):
+        opt.zero_grad(True)
         pred = mod(data[0])
         loss = torch.nn.CrossEntropyLoss()(pred, data[1])
         loss.backward()
+        opt.step()
 
     eager_times = []
     for i in range(N_ITERS):
         inp = generate_data(16)
-        opt.zero_grad(True)
         _, eager_time = timed(lambda: train(model, inp))
-        opt.step()
         eager_times.append(eager_time)
         print(f"eager train time {i}: {eager_time}")
         print("~" * 10)
@@ -213,9 +213,7 @@ Now, let's consider comparing training.
     compile_times = []
     for i in range(N_ITERS):
         inp = generate_data(16)
-        opt.zero_grad(True)
         _, compile_time = timed(lambda: train_opt(model, inp))
-        opt.step()
         compile_times.append(compile_time)
         print(f"compile train time {i}: {compile_time}")
         print("~" * 10)
@@ -229,13 +227,7 @@ Now, let's consider comparing training.
 Again, we can see that ``torch.compile`` takes longer in the first
 iteration, as it must compile the model, but afterward, we see
 significant speedups compared to eager. On an NVIDIA A100 GPU, we
-observe a 1.8x speedup.
-
-One thing to note is that, as of now, we cannot place optimizer code --
-``opt.zero_grad`` and ``opt.step`` -- inside of an optimized function.
-The rest of the training loop -- the forward pass and the backward pass --
-can be optimized. We are currently working on enabling optimizers to be
-compatible with ``torch.compile``.
+observe a 2.2x speedup.
 
 Comparison to TorchScript and FX Tracing
 -----------------------------------------
diff --git a/intermediate_source/torch_compile_tutorial_.py b/intermediate_source/torch_compile_tutorial_.py
index 2715259e95e..5cd5c40d578 100644
--- a/intermediate_source/torch_compile_tutorial_.py
+++ b/intermediate_source/torch_compile_tutorial_.py
@@ -175,7 +175,7 @@ def evaluate(mod, inp):
 ######################################################################
 # And indeed, we can see that running our model with ``torch.compile``
 # results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-# 2.2x speedup. Speedup mainly comes from reducing Python overhead and
+# 2.3x speedup. Speedup mainly comes from reducing Python overhead and
 # GPU read/writes, and so the observed speedup may vary on factors such as model
 # architecture and batch size. For example, if a model's architecture is simple
 # and the amount of data is large, then the bottleneck would be
@@ -193,16 +193,16 @@ def evaluate(mod, inp):
 opt = torch.optim.Adam(model.parameters())
 
 def train(mod, data):
+    opt.zero_grad(True)
     pred = mod(data[0])
     loss = torch.nn.CrossEntropyLoss()(pred, data[1])
     loss.backward()
+    opt.step()
 
 eager_times = []
 for i in range(N_ITERS):
     inp = generate_data(16)
-    opt.zero_grad(True)
     _, eager_time = timed(lambda: train(model, inp))
-    opt.step()
     eager_times.append(eager_time)
     print(f"eager train time {i}: {eager_time}")
     print("~" * 10)
@@ -214,9 +214,7 @@ def train(mod, data):
 compile_times = []
 for i in range(N_ITERS):
     inp = generate_data(16)
-    opt.zero_grad(True)
     _, compile_time = timed(lambda: train_opt(model, inp))
-    opt.step()
     compile_times.append(compile_time)
     print(f"compile train time {i}: {compile_time}")
     print("~" * 10)
@@ -231,13 +229,7 @@ def train(mod, data):
 # Again, we can see that ``torch.compile`` takes longer in the first
 # iteration, as it must compile the model, but afterward, we see
 # significant speedups compared to eager. On an NVIDIA A100 GPU, we
-# observe a 1.8x speedup.
-#
-# One thing to note is that, as of now, we cannot place optimizer code --
-# ``opt.zero_grad`` and ``opt.step`` -- inside of an optimized function.
-# The rest of the training loop -- the forward pass and the backward pass --
-# can be optimized. We are currently working on enabling optimizers to be
-# compatible with ``torch.compile``.
+# observe a 2.2x speedup.
 
 ######################################################################
 # Comparison to TorchScript and FX Tracing
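
For reference, here is a minimal standalone sketch of the module-level padding API that PATCH 1/2 switches to. The toy nested tensor below is illustrative rather than taken from the tutorial, and the snippet assumes a PyTorch build where the ``torch.nested`` namespace is available:

    import torch

    # A small nested tensor: two ragged "sequences" of lengths 2 and 4, embedding dim 3.
    nt = torch.nested.nested_tensor([torch.randn(2, 3), torch.randn(4, 3)])

    # Old method-style call used by the tutorial: nt.to_padded_tensor(0.0)
    # New module-level call adopted by the patch:
    pt = torch.nested.to_padded_tensor(nt, padding=0.0)
    print(pt.shape)  # torch.Size([2, 4, 3]), padded out to the longest entry

    # An explicit output size can also be requested, as in the tutorial's
    # padded_query / padded_key / padded_value lines:
    pt_sized = torch.nested.to_padded_tensor(nt, 0.0, (2, 6, 3))
    print(pt_sized.shape)  # torch.Size([2, 6, 3])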
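
Similarly, a minimal sketch of the training-step pattern that PATCH 2/2 adopts, with ``opt.zero_grad`` and ``opt.step`` inside the function handed to ``torch.compile``. The tiny model, optimizer, and random data are placeholders rather than the tutorial's, and the snippet assumes a PyTorch build in which optimizer calls are supported inside compiled functions:

    import torch

    model = torch.nn.Linear(8, 2)
    opt = torch.optim.Adam(model.parameters())

    def train(mod, data):
        # zero_grad/step now sit inside the compiled function,
        # so the whole training step is captured, not just forward and backward.
        opt.zero_grad(True)
        pred = mod(data[0])
        loss = torch.nn.CrossEntropyLoss()(pred, data[1])
        loss.backward()
        opt.step()

    train_opt = torch.compile(train)

    inp = (torch.randn(16, 8), torch.randint(0, 2, (16,)))
    train_opt(model, inp)  # the first call compiles; subsequent calls reuse the compiled code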