From d7dda238a2b318a9db384defe800fc1b4b23fe11 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Mon, 16 Nov 2020 12:09:06 -0800
Subject: [PATCH 1/2] Fix convergence issue with TF32

---
 .../examples_autograd/two_layer_net_autograd.py        | 10 +++++++++-
 .../examples_autograd/two_layer_net_custom_function.py | 10 +++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/beginner_source/examples_autograd/two_layer_net_autograd.py b/beginner_source/examples_autograd/two_layer_net_autograd.py
index 039ca154819..78a4ddf7f77 100755
--- a/beginner_source/examples_autograd/two_layer_net_autograd.py
+++ b/beginner_source/examples_autograd/two_layer_net_autograd.py
@@ -18,7 +18,15 @@
 dtype = torch.float
 device = torch.device("cpu")
-# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to run on GPU
+
+# The above line disables TensorFloat32. This is a feature that allows
+# networks to run at a much faster speed while sacrificing precision.
+# Although TensorFloat32 works well on most real models, for our toy model
+# in this tutorial, the sacrificed precision causes a precision issue.
+# For more information, see:
+# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

 # N is batch size; D_in is input dimension;
 # H is hidden dimension; D_out is output dimension.
diff --git a/beginner_source/examples_autograd/two_layer_net_custom_function.py b/beginner_source/examples_autograd/two_layer_net_custom_function.py
index 7b156feb586..178861a0b2e 100755
--- a/beginner_source/examples_autograd/two_layer_net_custom_function.py
+++ b/beginner_source/examples_autograd/two_layer_net_custom_function.py
@@ -48,7 +48,15 @@ def backward(ctx, grad_output):
 dtype = torch.float
 device = torch.device("cpu")
-# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to run on GPU
+
+# The above line disables TensorFloat32. This is a feature that allows
+# networks to run at a much faster speed while sacrificing precision.
+# Although TensorFloat32 works well on most real models, for our toy model
+# in this tutorial, the sacrificed precision causes a precision issue.
+# For more information, see:
+# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

 # N is batch size; D_in is input dimension;
 # H is hidden dimension; D_out is output dimension.

From 5e8c11089bc28762f0d4c6f50378213516445584 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Mon, 16 Nov 2020 12:13:58 -0800
Subject: [PATCH 2/2] save

---
 beginner_source/examples_autograd/two_layer_net_autograd.py | 2 +-
 .../examples_autograd/two_layer_net_custom_function.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/beginner_source/examples_autograd/two_layer_net_autograd.py b/beginner_source/examples_autograd/two_layer_net_autograd.py
index 78a4ddf7f77..ebbc98b2bb8 100755
--- a/beginner_source/examples_autograd/two_layer_net_autograd.py
+++ b/beginner_source/examples_autograd/two_layer_net_autograd.py
@@ -24,7 +24,7 @@
 # The above line disables TensorFloat32. This is a feature that allows
 # networks to run at a much faster speed while sacrificing precision.
 # Although TensorFloat32 works well on most real models, for our toy model
-# in this tutorial, the sacrificed precision causes a precision issue.
+# in this tutorial, the sacrificed precision causes a convergence issue.
 # For more information, see:
 # https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

diff --git a/beginner_source/examples_autograd/two_layer_net_custom_function.py b/beginner_source/examples_autograd/two_layer_net_custom_function.py
index 178861a0b2e..2d2a0875669 100755
--- a/beginner_source/examples_autograd/two_layer_net_custom_function.py
+++ b/beginner_source/examples_autograd/two_layer_net_custom_function.py
@@ -54,7 +54,7 @@ def backward(ctx, grad_output):
 # The above line disables TensorFloat32. This is a feature that allows
 # networks to run at a much faster speed while sacrificing precision.
 # Although TensorFloat32 works well on most real models, for our toy model
-# in this tutorial, the sacrificed precision causes a precision issue.
+# in this tutorial, the sacrificed precision causes a convergence issue.
 # For more information, see:
 # https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
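
---
The snippet below is not part of the patch; it is a minimal sketch of the
precision gap the new comments describe. It assumes a CUDA device with TF32
support (Ampere or newer), and the matrix size is an arbitrary choice for
illustration.

import torch

# Compare the same float32 matmul with TF32 enabled vs. disabled.
a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")

torch.backends.cuda.matmul.allow_tf32 = True   # on by default at the time of this patch
tf32_out = a @ b

torch.backends.cuda.matmul.allow_tf32 = False  # force full float32 precision
fp32_out = a @ b

# TF32 rounds the multiply inputs to 10 mantissa bits, so the two results
# differ; that rounding error is what keeps the tutorial's toy model from
# converging when TF32 is left on.
print((tf32_out - fp32_out).abs().max())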