Commit 05543d5

refactor_autocast_mechanism (#29)
1 parent c837979 commit 05543d5

5 files changed, +57 -29 lines changed

intel_pytorch_extension_py/amp/autocast_mode.py

Lines changed: 3 additions & 3 deletions
@@ -8,12 +8,12 @@

 class autocast(object):
     def __init__(self, enabled=True, configure=conf.AmpConf(torch.bfloat16)):
-        supported_dtype = [torch.float32, torch.bfloat16, torch.int8]
+        supported_dtype = [torch.bfloat16, torch.int8]
         if configure.dtype not in supported_dtype :
             warnings.warn("In CPU autocast, but the target dtype is not supported. Disable the autocast.")
-            warnings.warn("Supported dtype input is: torch.float32, torch.bfloat16, torch.int8.")
+            warnings.warn("Supported dtype input is: torch.bfloat16, torch.int8.")
             enabled = False
-            configure = conf.AmpConf(torch.float32)
+            configure = conf.AmpConf(torch.bfloat16)
         self._enabled = enabled
         self._dtype = configure.dtype
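With this change the Python frontend only accepts torch.bfloat16 and torch.int8 as autocast targets, and an unsupported dtype now falls back to a bfloat16 configuration instead of float32. A minimal usage sketch, not part of this commit; the `import intel_pytorch_extension as ipex` alias and the `ipex.amp.autocast` path are assumptions about how the package is exposed:

import torch
import intel_pytorch_extension as ipex  # assumed install/import name

model = torch.nn.Linear(64, 64)
x = torch.randn(8, 64)

# The default (and fallback) target dtype is now torch.bfloat16; requesting
# torch.float32 would disable autocast with a warning.
with ipex.amp.autocast(enabled=True):
    y = model(x)

print(y.dtype)  # expected: torch.bfloat16, since addmm follows the user-defined dtype policy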

torch_ipex/csrc/autocast_kernel.cpp

Lines changed: 4 additions & 4 deletions
@@ -51,9 +51,9 @@ at::Tensor conv_transpose3d(const at::Tensor& input, const at::Tensor& weight, c
 #if defined(ENABLE_AUTOCAST_VERBOSE)
   verbose::OpNameGuard op_name("conv_transpose3d");
 #endif
-  return at::conv_transpose3d(cpu_cached_cast(at::kFloat, input),
-                              cpu_cached_cast(at::kFloat, weight),
-                              cpu_cached_cast(at::kFloat, bias),
+  return at::conv_transpose3d(cpu_cached_cast(target_type, input),
+                              cpu_cached_cast(target_type, weight),
+                              cpu_cached_cast(target_type, bias),
                               stride, padding, output_padding, groups, dilation);
 }

@@ -222,7 +222,7 @@ at::Tensor gelu(const at::Tensor& input) {
     return int8::gelu(input);
   }
   // convert to fp32 path.
-  return at::gelu(cpu_cached_cast(at::kFloat, input));
+  return at::gelu(input);
 }

 } // autocast
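After this change conv_transpose3d casts its arguments to the active target dtype instead of unconditionally casting to fp32, and gelu no longer inserts a cast at all, so it runs in whatever dtype it receives. A small sketch of the expected behavior, with the same assumptions as above about the import alias and the public autocast path; shapes are arbitrary:

import torch
import torch.nn.functional as F
import intel_pytorch_extension as ipex  # assumed install/import name

x = torch.randn(1, 2, 4, 4, 4)  # fp32 input  (N, C_in, D, H, W)
w = torch.randn(2, 3, 3, 3, 3)  # fp32 weight (C_in, C_out, kD, kH, kW)

with ipex.amp.autocast(enabled=True):
    # conv_transpose3d now casts to the user-defined target dtype (bfloat16 by default)
    # instead of always falling back to fp32.
    y = F.conv_transpose3d(x, w)
    # gelu no longer forces fp32; it keeps the dtype of its input.
    z = F.gelu(y)

print(y.dtype, z.dtype)  # expected: torch.bfloat16 torch.bfloat16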

torch_ipex/csrc/autocast_mode.cpp

Lines changed: 8 additions & 17 deletions
@@ -15,16 +15,16 @@ thread_local std::unordered_map<c10::TensorImpl *, val_type> cached_casts;

 thread_local int nesting = 0;

-thread_local at::ScalarType current_target_dtype = at::kFloat;
+thread_local at::ScalarType current_target_dtype = at::kBFloat16;
 } // namespace

 bool is_autocast_enabled() {
-  return c10::impl::tls_is_dispatch_key_included(c10::DispatchKey::AutocastCPU);
+  return !c10::impl::tls_is_dispatch_key_excluded(c10::DispatchKey::AutocastCPU);
 }

 void set_autocast_enabled(bool new_enabled) {
-  c10::impl::tls_set_dispatch_key_included(c10::DispatchKey::AutocastCPU,
-                                           new_enabled);
+  c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastCPU,
+                                           !new_enabled);
 }

 at::ScalarType get_autocast_dtype() {

@@ -176,23 +176,14 @@ MAKE_REGISTER_FUNC(ADD_NS(mm), "mm", Tensor (const Tensor &, const Tensor &), us
 MAKE_REGISTER_FUNC(ADD_NS(baddbmm), "baddbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), user_defined_dtype)
 MAKE_REGISTER_FUNC(ADD_NS(addmm), "addmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), user_defined_dtype)
 MAKE_REGISTER_FUNC(ADD_NS(addbmm), "addbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), user_defined_dtype)
+MAKE_REGISTER_FUNC(ADD_NS(conv_transpose1d), "conv_transpose1d", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&,
+                   IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), user_defined_dtype)
+MAKE_REGISTER_FUNC(ADD_NS(conv_transpose2d), "conv_transpose2d.input", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&,
+                   IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), user_defined_dtype)

 // fp32 cast policy
-MAKE_REGISTER_FUNC(ADD_NS(convolution), "convolution", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t), fp32)
 MAKE_REGISTER_FUNC(ADD_NS(avg_pool2d), "avg_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional<int64_t>), fp32)
 MAKE_REGISTER_FUNC(ADD_NS(avg_pool3d), "avg_pool3d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional<int64_t>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_nearest1d), "upsample_nearest1d", Tensor (const Tensor &, IntArrayRef, c10::optional<double>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_nearest1d), "upsample_nearest1d.vec", Tensor (const Tensor &, c10::optional<IntArrayRef>, c10::optional<ArrayRef<double>>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_nearest2d), "upsample_nearest2d", Tensor (const Tensor &, IntArrayRef, c10::optional<double>, c10::optional<double>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_nearest2d), "upsample_nearest2d.vec", Tensor (const Tensor &, c10::optional<IntArrayRef>, c10::optional<ArrayRef<double>>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_nearest3d), "upsample_nearest3d", Tensor (const Tensor &, IntArrayRef, c10::optional<double>, c10::optional<double>, c10::optional<double>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_nearest3d), "upsample_nearest3d.vec", Tensor (const Tensor &, c10::optional<IntArrayRef>, c10::optional<ArrayRef<double>>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_linear1d), "upsample_linear1d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional<double>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_linear1d), "upsample_linear1d.vec", Tensor (const Tensor &, c10::optional<IntArrayRef>, bool, c10::optional<ArrayRef<double>>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional<double>, c10::optional<double>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d.vec", Tensor (const Tensor &, c10::optional<IntArrayRef>, bool, c10::optional<ArrayRef<double>>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional<double>, c10::optional<double>, c10::optional<double>), fp32)
-MAKE_REGISTER_FUNC(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d.vec", Tensor (const Tensor &, c10::optional<IntArrayRef>, bool, c10::optional<ArrayRef<double>>), fp32)
 MAKE_REGISTER_FUNC(ADD_NS(binary_cross_entropy), "binary_cross_entropy", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, int64_t), fp32)
 MAKE_REGISTER_FUNC(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, const c10::optional<Tensor>&, int64_t), fp32)
 MAKE_REGISTER_FUNC(ADD_NS(pow), "pow.Tensor_Scalar", Tensor (const Tensor &, const Scalar &), fp32)
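Besides switching the enabled check to the excluded-dispatch-key form and dropping convolution and the upsample ops from the forced-fp32 list, this hunk registers conv_transpose1d and conv_transpose2d.input under the user_defined_dtype policy, alongside mm/addmm and friends. A sketch of what the new registrations are expected to do, under the same import-path assumptions as the earlier examples:

import torch
import torch.nn.functional as F
import intel_pytorch_extension as ipex  # assumed install/import name

x1 = torch.randn(1, 2, 8)     # (N, C_in, L)    for conv_transpose1d
w1 = torch.randn(2, 3, 3)     # (C_in, C_out, kW)
x2 = torch.randn(1, 2, 8, 8)  # (N, C_in, H, W) for conv_transpose2d
w2 = torch.randn(2, 3, 3, 3)  # (C_in, C_out, kH, kW)

with ipex.amp.autocast(enabled=True):
    # user_defined_dtype policy: arguments are cast to the configured target dtype
    # (bfloat16 by default) before the op runs.
    y1 = F.conv_transpose1d(x1, w1)
    y2 = F.conv_transpose2d(x2, w2)

print(y1.dtype, y2.dtype)  # expected: torch.bfloat16 torch.bfloat16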

torch_ipex/csrc/init_python_bindings.cpp

Lines changed: 0 additions & 1 deletion
@@ -77,7 +77,6 @@ void InitIpexModuleBindings(py::module m) {
   m.def("disable_jit_opt", []() { AutoOptConfig::singleton().set_jit_fuse(false); });
   m.def("get_jit_opt", []() { return AutoOptConfig::singleton().get_jit_fuse(); });

-
   // int8 path
   m.def("clear_autocast_cache_int8", &torch_ipex::autocast::int8::clear_autocast_cache_int8);
   m.def("enable_int8_calibration", []() { AutoOptConfig::singleton().set_int8_calibration(true); });

torch_patches/autocast.patch

Lines changed: 42 additions & 4 deletions
@@ -1,12 +1,50 @@
 diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h
-index b32f991df3..99bf28b380 100644
+index ff6a84ebbe..b3d3153169 100644
 --- a/c10/core/DispatchKey.h
 +++ b/c10/core/DispatchKey.h
-@@ -227,6 +227,7 @@ enum class DispatchKey : uint8_t {
+@@ -228,7 +228,7 @@ enum class DispatchKey : uint8_t {
+
   // Autocasting precedes VariableTypeId, to ensure casts are autograd-exposed
   // and inputs are saved for backward in the post-autocast type.
-  Autocast,
+-  // AutocastCPU,
 +  AutocastCPU,
+   AutocastCUDA,

   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ WRAPPERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
-  // There are a number of alternative modes which may want to handle before
+diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp
+index 272cf33118..8358e931f0 100644
+--- a/c10/core/DispatchKeySet.cpp
++++ b/c10/core/DispatchKeySet.cpp
+@@ -78,8 +78,8 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
+
+ DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t) {
+   switch (t) {
+-    // case DispatchKey::CPU:
+-    //   return DispatchKeySet(DispatchKey::AutocastCPU);
++    case DispatchKey::CPU:
++      return DispatchKeySet(DispatchKey::AutocastCPU);
+     case DispatchKey::CUDA:
+       return DispatchKeySet(DispatchKey::AutocastCUDA);
+     default:
+diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h
+index 223355203c..e11572f23a 100644
+--- a/c10/core/DispatchKeySet.h
++++ b/c10/core/DispatchKeySet.h
+@@ -223,7 +223,7 @@ constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
+ });
+
+ constexpr DispatchKeySet autocast_dispatch_keyset = DispatchKeySet({
+-  // DispatchKey::AutocastCPU,
++  DispatchKey::AutocastCPU,
+   DispatchKey::AutocastCUDA,
+ });
+
+@@ -234,7 +234,7 @@ constexpr DispatchKeySet default_included_set = DispatchKeySet({
+ });
+
+ constexpr DispatchKeySet default_excluded_set = DispatchKeySet({
+-  // DispatchKey::AutocastCPU,
++  DispatchKey::AutocastCPU,
+   DispatchKey::AutocastCUDA,
+ });
