diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index 5ed9a0d1390..cce52eefdb3 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -103,15 +103,6 @@ def generate_square_subsequent_mask(sz: int) -> Tensor:
 # positional encodings have the same dimension as the embeddings so that
 # the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
 # different frequencies.
-# The ``div_term`` in the code is calculated as
-# ``torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))``.
-# This calculation is based on the original Transformer paper’s formulation
-# for positional encoding. The purpose of this calculation is to create
-# a range of values that decrease exponentially.
-# This allows the model to learn to attend to positions based on their relative distances.
-# The ``math.log(10000.0)`` term in the exponent represents the maximum effective
-# input length (in this case, ``10000``). Dividing this term by ``d_model`` scales
-# the values to be within a reasonable range for the exponential function.
 #

 class PositionalEncoding(nn.Module):
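
For reference, the comments touched by this hunk sit directly above the tutorial's ``PositionalEncoding`` class, where ``div_term`` is used to build the sine/cosine tables. The sketch below shows the standard formulation those comments refer to (assuming an even ``d_model`` and sequence-first inputs); it is an illustration, not a verbatim copy of the tutorial file:

```python
import math

import torch
from torch import nn, Tensor


class PositionalEncoding(nn.Module):
    """Add sine/cosine waves of different frequencies to token embeddings."""

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)          # [max_len, 1]
        # One frequency per pair of embedding dimensions; frequencies
        # decrease geometrically from 1 down to 1/10000.
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)         # even dims
        pe[:, 0, 1::2] = torch.cos(position * div_term)         # odd dims
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        # x: [seq_len, batch_size, embedding_dim]
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


# Example usage (hypothetical shapes): add positional information to a
# batch of 32 sequences of length 10 with 512-dimensional embeddings.
pos_encoder = PositionalEncoding(d_model=512)
out = pos_encoder(torch.zeros(10, 32, 512))
```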