pytorch · carljparker · Jun 2, 2023 · Jun 1, 2023 · Jun 1, 2023 · Jun 2, 2023
diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
@@ -103,7 +103,23 @@ def generate_square_subsequent_mask(sz: int) -> Tensor:
 # positional encodings have the same dimension as the embeddings so that
 # the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
 # different frequencies.
-#
+# The ``div_term`` in the code is calculated as
+# ``torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))``.
+# This calculation is based on the original Transformer paper's formulation
+# for positional encoding and is mathematically equivalent to
+# ``10000 ** (-2i / d_model)`` for ``i = 0, 1, ...``. The purpose of this
+# calculation is to create a range of frequencies that decrease exponentially
+# (that is, geometrically) across the embedding dimensions.
+# This allows the model to learn to attend to positions based on their relative distances.
+# The ``10000.0`` inside ``math.log(10000.0)`` is the constant from the paper that
+# sets the longest wavelength of the sinusoids; it can be thought of as the maximum
+# effective input length (in this case, ``10000``). Dividing the logarithm by
+# ``d_model`` spreads the exponents evenly across the embedding dimensions, so
+# the resulting values range from ``1`` down to roughly ``1 / 10000``.
+# The negative sign in front of the logarithm ensures that the values decrease exponentially.
+# The reason for writing ``math.log(10000.0)`` instead of a hard-coded number in the code
+# (``math.log`` is the natural logarithm, so the value is approximately ``9.21``, not ``4``)
+# is to make it clear that this value represents the logarithm of the maximum effective
+# input length (in this case, ``10000``). This makes the code more readable and easier to understand.
+# Using ``math.log(10000.0)`` instead of a precomputed constant also makes it easier to change
+# the maximum effective input length if needed. If you want to use a different value for the
+# maximum effective input length, you can simply change the argument of the ``math.log``
+# function instead of recalculating the logarithm manually.
 
 class PositionalEncoding(nn.Module):