From 0d8c59f0822bffc3b1b3e15d3eeed4e24d2918a0 Mon Sep 17 00:00:00 2001
From: Driss Guessous
Date: Mon, 12 Sep 2022 21:58:28 +0000
Subject: [PATCH 1/2] Update nestedtensor to_padded calls

---
 prototype_source/nestedtensor.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/prototype_source/nestedtensor.py b/prototype_source/nestedtensor.py
index 6ab4891d677..dd7c8cd2c76 100644
--- a/prototype_source/nestedtensor.py
+++ b/prototype_source/nestedtensor.py
@@ -39,7 +39,7 @@
 ######################################################################
 # By padding every underlying tensor to the same shape,
 # a nested tensor can be converted to a regular tensor.
-pt = nt.to_padded_tensor(0.0)
+pt = torch.nested.to_padded_tensor(nt, padding=0.0)
 print(pt)
 
 ######################################################################
@@ -400,9 +400,9 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 value = torch.nested_tensor(values )
 
 # pad input
-padded_query = query.to_padded_tensor(0.0, (N, L_t, E_q))
-padded_key = key .to_padded_tensor(0.0, (N, L_s, E_k))
-padded_value = value.to_padded_tensor(0.0, (N, L_s, E_v))
+padded_query = torch.nested.to_padded_tensor(query, 0.0, (N, L_t, E_q))
+padded_key = torch.nested.to_padded_tensor(key, 0.0, (N, L_s, E_k))
+padded_value = torch.nested.to_padded_tensor(value, 0.0, (N, L_s, E_v))
 
 # create attention masks
 attn_mask_q = torch.zeros((N, L_t), dtype=torch.bool)
@@ -436,7 +436,7 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 dropout_p=dropout_p)
 t2 = timeit.default_timer()
 
-print("nested and padded calculations differ by", (out_nested.to_padded_tensor(0.0, (N, L_t, E_out)) - out_padded).abs().max().item())
+print("nested and padded calculations differ by", (torch.nested.to_padded_tensor(out_nested, 0.0, (N, L_t, E_out)) - out_padded).abs().max().item())
 print("nested tensor multi-head attention takes", t1 - t0, "seconds")
 print("padded tensor multi-head attention takes", t2 - t1, "seconds")
 
@@ -486,7 +486,7 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 dropout_p=dropout_p)
 t3 = timeit.default_timer()
 
-print("nested general and library calculations differ by", (out_nested.to_padded_tensor(0.0) - out_lib.to_padded_tensor(0.0)).abs().max().item())
+print("nested general and library calculations differ by", (torch.nested.to_padded_tensor(out_nested, 0.0) - torch.nested.to_padded_tensor(out_lib, 0.0)).abs().max().item())
 print("nested library multi-head attention takes", t1 - t0, "seconds")
 print("nested general multi-head attention takes", t2 - t1, "seconds")
 print("padded tensor multi-head attention takes", t3 - t2, "seconds")

From e6000d06ea3b7e4cd81b5907d78e59ba3c38a691 Mon Sep 17 00:00:00 2001
From: William Wen
Date: Tue, 20 Dec 2022 16:08:28 +0000
Subject: [PATCH 2/2] add optimizer to torch.compile part

---
 intermediate_source/torch_compile_tutorial.rst | 16 ++++------------
 intermediate_source/torch_compile_tutorial_.py | 16 ++++------------
 2 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/intermediate_source/torch_compile_tutorial.rst b/intermediate_source/torch_compile_tutorial.rst
index e065df6c2cc..c39a4f9c413 100644
--- a/intermediate_source/torch_compile_tutorial.rst
+++ b/intermediate_source/torch_compile_tutorial.rst
@@ -172,7 +172,7 @@ see a significant improvement compared to eager.
 
 And indeed, we can see that running our model with ``torch.compile``
 results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-2.2x speedup. Speedup mainly comes from reducing Python overhead and
+2.3x speedup. Speedup mainly comes from reducing Python overhead and
 GPU read/writes, and so the observed speedup may vary on factors such as model
 architecture and batch size. For example, if a model's architecture is simple
 and the amount of data is large, then the bottleneck would be
@@ -192,16 +192,16 @@ Now, let's consider comparing training.
     opt = torch.optim.Adam(model.parameters())
 
     def train(mod, data):
+        opt.zero_grad(True)
         pred = mod(data[0])
         loss = torch.nn.CrossEntropyLoss()(pred, data[1])
         loss.backward()
+        opt.step()
 
     eager_times = []
     for i in range(N_ITERS):
         inp = generate_data(16)
-        opt.zero_grad(True)
         _, eager_time = timed(lambda: train(model, inp))
-        opt.step()
         eager_times.append(eager_time)
         print(f"eager train time {i}: {eager_time}")
         print("~" * 10)
@@ -213,9 +213,7 @@ Now, let's consider comparing training.
     compile_times = []
     for i in range(N_ITERS):
         inp = generate_data(16)
-        opt.zero_grad(True)
         _, compile_time = timed(lambda: train_opt(model, inp))
-        opt.step()
         compile_times.append(compile_time)
         print(f"compile train time {i}: {compile_time}")
         print("~" * 10)
@@ -229,13 +227,7 @@ Now, let's consider comparing training.
 Again, we can see that ``torch.compile`` takes longer in the first
 iteration, as it must compile the model, but afterward, we see
 significant speedups compared to eager. On an NVIDIA A100 GPU, we
-observe a 1.8x speedup.
-
-One thing to note is that, as of now, we cannot place optimizer code --
-``opt.zero_grad`` and ``opt.step`` -- inside of an optimized function.
-The rest of the training loop -- the forward pass and the backward pass --
-can be optimized. We are currently working on enabling optimizers to be
-compatible with ``torch.compile``.
+observe a 2.2x speedup.
 
 Comparison to TorchScript and FX Tracing
 -----------------------------------------
diff --git a/intermediate_source/torch_compile_tutorial_.py b/intermediate_source/torch_compile_tutorial_.py
index 2715259e95e..5cd5c40d578 100644
--- a/intermediate_source/torch_compile_tutorial_.py
+++ b/intermediate_source/torch_compile_tutorial_.py
@@ -175,7 +175,7 @@ def evaluate(mod, inp):
 ######################################################################
 # And indeed, we can see that running our model with ``torch.compile``
 # results in a significant speedup. On an NVIDIA A100 GPU, we observe a
-# 2.2x speedup. Speedup mainly comes from reducing Python overhead and
+# 2.3x speedup. Speedup mainly comes from reducing Python overhead and
 # GPU read/writes, and so the observed speedup may vary on factors such as model
 # architecture and batch size. For example, if a model's architecture is simple
 # and the amount of data is large, then the bottleneck would be
@@ -193,16 +193,16 @@ def evaluate(mod, inp):
 opt = torch.optim.Adam(model.parameters())
 
 def train(mod, data):
+    opt.zero_grad(True)
     pred = mod(data[0])
     loss = torch.nn.CrossEntropyLoss()(pred, data[1])
     loss.backward()
+    opt.step()
 
 eager_times = []
 for i in range(N_ITERS):
     inp = generate_data(16)
-    opt.zero_grad(True)
     _, eager_time = timed(lambda: train(model, inp))
-    opt.step()
     eager_times.append(eager_time)
     print(f"eager train time {i}: {eager_time}")
     print("~" * 10)
@@ -214,9 +214,7 @@ def train(mod, data):
 compile_times = []
 for i in range(N_ITERS):
     inp = generate_data(16)
-    opt.zero_grad(True)
     _, compile_time = timed(lambda: train_opt(model, inp))
-    opt.step()
     compile_times.append(compile_time)
     print(f"compile train time {i}: {compile_time}")
     print("~" * 10)
@@ -231,13 +229,7 @@ def train(mod, data):
 # Again, we can see that ``torch.compile`` takes longer in the first
 # iteration, as it must compile the model, but afterward, we see
 # significant speedups compared to eager. On an NVIDIA A100 GPU, we
-# observe a 1.8x speedup.
-#
-# One thing to note is that, as of now, we cannot place optimizer code --
-# ``opt.zero_grad`` and ``opt.step`` -- inside of an optimized function.
-# The rest of the training loop -- the forward pass and the backward pass --
-# can be optimized. We are currently working on enabling optimizers to be
-# compatible with ``torch.compile``.
+# observe a 2.2x speedup.
 
 ######################################################################
 # Comparison to TorchScript and FX Tracing
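
For reference, here is a minimal standalone sketch of the module-level padding API that PATCH 1/2 switches to. The toy nested tensor below is illustrative rather than taken from the tutorial, and the snippet assumes a PyTorch build where the ``torch.nested`` namespace is available:

    import torch

    # A small nested tensor: two ragged "sequences" of lengths 2 and 4, embedding dim 3.
    nt = torch.nested.nested_tensor([torch.randn(2, 3), torch.randn(4, 3)])

    # Old method-style call used by the tutorial: nt.to_padded_tensor(0.0)
    # New module-level call adopted by the patch:
    pt = torch.nested.to_padded_tensor(nt, padding=0.0)
    print(pt.shape)  # torch.Size([2, 4, 3]), padded out to the longest entry

    # An explicit output size can also be requested, as in the tutorial's
    # padded_query / padded_key / padded_value lines:
    pt_sized = torch.nested.to_padded_tensor(nt, 0.0, (2, 6, 3))
    print(pt_sized.shape)  # torch.Size([2, 6, 3])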
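
Similarly, a minimal sketch of the training-step pattern that PATCH 2/2 adopts, with ``opt.zero_grad`` and ``opt.step`` inside the function handed to ``torch.compile``. The tiny model, optimizer, and random data are placeholders rather than the tutorial's, and the snippet assumes a PyTorch build in which optimizer calls are supported inside compiled functions:

    import torch

    model = torch.nn.Linear(8, 2)
    opt = torch.optim.Adam(model.parameters())

    def train(mod, data):
        # zero_grad/step now sit inside the compiled function,
        # so the whole training step is captured, not just forward and backward.
        opt.zero_grad(True)
        pred = mod(data[0])
        loss = torch.nn.CrossEntropyLoss()(pred, data[1])
        loss.backward()
        opt.step()

    train_opt = torch.compile(train)

    inp = (torch.randn(16, 8), torch.randint(0, 2, (16,)))
    train_opt(model, inp)  # the first call compiles; subsequent calls reuse the compiled code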