diff --git a/beginner_source/examples_autograd/two_layer_net_autograd.py b/beginner_source/examples_autograd/two_layer_net_autograd.py
index 039ca154819..ebbc98b2bb8 100755
--- a/beginner_source/examples_autograd/two_layer_net_autograd.py
+++ b/beginner_source/examples_autograd/two_layer_net_autograd.py
@@ -18,7 +18,15 @@
 
 dtype = torch.float
 device = torch.device("cpu")
-# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to run on GPU
+
+# The above line disables TensorFloat32. This is a feature that allows
+# networks to run at a much faster speed while sacrificing precision.
+# Although TensorFloat32 works well on most real models, for our toy model
+# in this tutorial, the sacrificed precision causes convergence issues.
+# For more information, see:
+# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
 
 # N is batch size; D_in is input dimension;
 # H is hidden dimension; D_out is output dimension.
diff --git a/beginner_source/examples_autograd/two_layer_net_custom_function.py b/beginner_source/examples_autograd/two_layer_net_custom_function.py
index 7b156feb586..2d2a0875669 100755
--- a/beginner_source/examples_autograd/two_layer_net_custom_function.py
+++ b/beginner_source/examples_autograd/two_layer_net_custom_function.py
@@ -48,7 +48,15 @@ def backward(ctx, grad_output):
 
 dtype = torch.float
 device = torch.device("cpu")
-# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to run on GPU
+
+# The above line disables TensorFloat32. This is a feature that allows
+# networks to run at a much faster speed while sacrificing precision.
+# Although TensorFloat32 works well on most real models, for our toy model
+# in this tutorial, the sacrificed precision causes convergence issues.
+# For more information, see:
+# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
 
 # N is batch size; D_in is input dimension;
 # H is hidden dimension; D_out is output dimension.
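Reviewer note, not part of the patch: a minimal sketch of what the torch.backends.cuda.matmul.allow_tf32 flag changes. The matrix size and the float64 reference are arbitrary choices for illustration; the precision gap only appears on an Ampere-or-newer GPU, since TF32 is not used elsewhere.

import torch

if torch.cuda.is_available():
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")
    ref = a.double() @ b.double()  # float64 reference result

    # Matmul with TF32 enabled: faster, but lower-precision accumulation.
    torch.backends.cuda.matmul.allow_tf32 = True
    err_tf32 = (a @ b - ref).abs().max().item()

    # Matmul with TF32 disabled: full float32 precision, as the tutorial needs.
    torch.backends.cuda.matmul.allow_tf32 = False
    err_fp32 = (a @ b - ref).abs().max().item()

    print(f"max abs error with TF32: {err_tf32:.2e}, without TF32: {err_fp32:.2e}")

On hardware with TF32 support the first error is typically a few orders of magnitude larger than the second, which is why the toy model in these tutorials converges poorly with TF32 left on.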