Commit 8e581e2

wrap in fused kernel scope
1 parent 6c67d6d

File tree

1 file changed: +9 -3 lines changed


intermediate_source/scaled_dot_product_attention_tutorial.py

Lines changed: 9 additions & 3 deletions
@@ -228,12 +228,18 @@ def generate_rand_batch(
         seq_len_list,
     )
 
-# Currently the fastpaths don't support NestedTensor for training
 random_nt, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=0.5, dtype=dtype, device=device)
 random_dense, _ = generate_rand_batch(32, 512, embed_dimension, pad_percentage=None, dtype=dtype, device=device)
+
+# Currently the fused implementations don't support NestedTensor for training
 model.eval()
-print(f"Random NT runs in {benchmark_torch_function_in_microseconds(model, random_nt):.3f} microseconds")
-print(f"Random Dense runs in {benchmark_torch_function_in_microseconds(model, random_dense):.3f} microseconds")
+
+with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]):
+    try:
+        print(f"Random NT runs in {benchmark_torch_function_in_microseconds(model, random_nt):.3f} microseconds")
+        print(f"Random Dense runs in {benchmark_torch_function_in_microseconds(model, random_dense):.3f} microseconds")
+    except RuntimeError:
+        print("FlashAttention is not supported. See warnings for reasons.")
 
 
 ######################################################################
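
For context, the new with-block relies on sdp_kernel, SDPBackend, and a backend_map helper that the tutorial defines earlier in the file. A minimal sketch of what that setup likely looks like; the exact backend_map dictionary here is an assumption for illustration, not part of this commit:

# Sketch of the helpers the new with-block assumes are already defined.
# backend_map below is an assumed mapping, not taken from this commit.
from torch.backends.cuda import sdp_kernel, SDPBackend

# Each entry enables exactly one scaled_dot_product_attention backend.
backend_map = {
    SDPBackend.MATH: {
        "enable_math": True, "enable_flash": False, "enable_mem_efficient": False},
    SDPBackend.FLASH_ATTENTION: {
        "enable_math": False, "enable_flash": True, "enable_mem_efficient": False},
    SDPBackend.EFFICIENT_ATTENTION: {
        "enable_math": False, "enable_flash": False, "enable_mem_efficient": True},
}

Unpacking one entry into sdp_kernel restricts attention to that single backend for everything inside the with-block, which is why an unsupported configuration surfaces as a RuntimeError that the new try/except catches.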
