From d7dda238a2b318a9db384defe800fc1b4b23fe11 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Mon, 16 Nov 2020 12:09:06 -0800
Subject: [PATCH 1/2] Fix convergence issue with TF32

---
 .../examples_autograd/two_layer_net_autograd.py        | 10 +++++++++-
 .../examples_autograd/two_layer_net_custom_function.py | 10 +++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/beginner_source/examples_autograd/two_layer_net_autograd.py b/beginner_source/examples_autograd/two_layer_net_autograd.py
index 039ca154819..78a4ddf7f77 100755
--- a/beginner_source/examples_autograd/two_layer_net_autograd.py
+++ b/beginner_source/examples_autograd/two_layer_net_autograd.py
@@ -18,7 +18,15 @@
 dtype = torch.float
 device = torch.device("cpu")
-# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to run on GPU
+
+# The above line disables TensorFloat32. This is a feature that allows
+# networks to run at a much faster speed while sacrificing precision.
+# Although TensorFloat32 works well on most real models, for our toy model
+# in this tutorial, the sacrificed precision causes a precision issue.
+# For more information, see:
+# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

 # N is batch size; D_in is input dimension;
 # H is hidden dimension; D_out is output dimension.
diff --git a/beginner_source/examples_autograd/two_layer_net_custom_function.py b/beginner_source/examples_autograd/two_layer_net_custom_function.py
index 7b156feb586..178861a0b2e 100755
--- a/beginner_source/examples_autograd/two_layer_net_custom_function.py
+++ b/beginner_source/examples_autograd/two_layer_net_custom_function.py
@@ -48,7 +48,15 @@ def backward(ctx, grad_output):
 dtype = torch.float
 device = torch.device("cpu")
-# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# device = torch.device("cuda:0") # Uncomment this to run on GPU
+# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to run on GPU
+
+# The above line disables TensorFloat32. This is a feature that allows
+# networks to run at a much faster speed while sacrificing precision.
+# Although TensorFloat32 works well on most real models, for our toy model
+# in this tutorial, the sacrificed precision causes a precision issue.
+# For more information, see:
+# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

 # N is batch size; D_in is input dimension;
 # H is hidden dimension; D_out is output dimension.

From 5e8c11089bc28762f0d4c6f50378213516445584 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Mon, 16 Nov 2020 12:13:58 -0800
Subject: [PATCH 2/2] save

---
 beginner_source/examples_autograd/two_layer_net_autograd.py | 2 +-
 .../examples_autograd/two_layer_net_custom_function.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/beginner_source/examples_autograd/two_layer_net_autograd.py b/beginner_source/examples_autograd/two_layer_net_autograd.py
index 78a4ddf7f77..ebbc98b2bb8 100755
--- a/beginner_source/examples_autograd/two_layer_net_autograd.py
+++ b/beginner_source/examples_autograd/two_layer_net_autograd.py
@@ -24,7 +24,7 @@
 # The above line disables TensorFloat32. This is a feature that allows
 # networks to run at a much faster speed while sacrificing precision.
 # Although TensorFloat32 works well on most real models, for our toy model
-# in this tutorial, the sacrificed precision causes a precision issue.
+# in this tutorial, the sacrificed precision causes a convergence issue.
 # For more information, see:
 # https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

diff --git a/beginner_source/examples_autograd/two_layer_net_custom_function.py b/beginner_source/examples_autograd/two_layer_net_custom_function.py
index 178861a0b2e..2d2a0875669 100755
--- a/beginner_source/examples_autograd/two_layer_net_custom_function.py
+++ b/beginner_source/examples_autograd/two_layer_net_custom_function.py
@@ -54,7 +54,7 @@ def backward(ctx, grad_output):
 # The above line disables TensorFloat32. This is a feature that allows
 # networks to run at a much faster speed while sacrificing precision.
 # Although TensorFloat32 works well on most real models, for our toy model
-# in this tutorial, the sacrificed precision causes a precision issue.
+# in this tutorial, the sacrificed precision causes a convergence issue.
 # For more information, see:
 # https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
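
---
The snippet below is not part of the patch; it is a minimal sketch of the
precision gap the new comments describe. It assumes a CUDA device with TF32
support (Ampere or newer), and the matrix size is an arbitrary choice for
illustration.

import torch

# Compare the same float32 matmul with TF32 enabled vs. disabled.
a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")

torch.backends.cuda.matmul.allow_tf32 = True   # on by default at the time of this patch
tf32_out = a @ b

torch.backends.cuda.matmul.allow_tf32 = False  # force full float32 precision
fp32_out = a @ b

# TF32 rounds the multiply inputs to 10 mantissa bits, so the two results
# differ; that rounding error is what keeps the tutorial's toy model from
# converging when TF32 is left on.
print((tf32_out - fp32_out).abs().max())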