diff --git a/intermediate_source/scaled_dot_product_attention_tutorial.py b/intermediate_source/scaled_dot_product_attention_tutorial.py
index 669e516f2c2..2bfeb46b56c 100644
--- a/intermediate_source/scaled_dot_product_attention_tutorial.py
+++ b/intermediate_source/scaled_dot_product_attention_tutorial.py
@@ -317,7 +317,7 @@ def generate_rand_batch(
 # on the same set of functions for both modules.
 # The reason for this here is that ``torch.compile`` is very good at removing the
 # framework overhead associated with PyTorch. If your model is launching
-# large, efficient CUDA kernels, which in this case ``CausaulSelfAttention``
+# large, efficient CUDA kernels, which in this case ``CausalSelfAttention``
 # is, then the overhead of PyTorch can be hidden.
 #
 # In reality, your module does not normally consist of a singular