Commit fcab70d

Liangan1/embedding bf16 enable (#36)
* Enable BF16 embedding convert type in utils.py. Background: the Embedding input index is long type and its output is FP32 by default, even when a BF16 embedding table is enabled. In the BERT residual block, the FP32 embedding result is added to the BF16 linear output, yielding an FP32 output; this introduces many dtype conversions and lets only some ops run in BF16.
* Add LayerNorm to the autocast white list. Background: LayerNorm is a fallthrough op by default and its weight/bias are FP32, so a dtype error occurs when the input is BF16.
* 1) Add LayerNorm to the module dtype-convert list. 2) Refine code.
1 parent 35a8b91 commit fcab70d
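
To make the motivation concrete, here is a minimal sketch (not part of this commit) of the dtype promotion the message describes: an FP32 Embedding output added to a BF16 Linear output promotes the result back to FP32, while converting the embedding table to BF16 keeps the residual add in BF16.

# Minimal sketch of the residual-block dtype promotion; modules here are
# illustrative only, not taken from the patched code.
import torch

emb = torch.nn.Embedding(10, 4)                   # weight is FP32 by default
linear = torch.nn.Linear(4, 4).to(torch.bfloat16)

idx = torch.tensor([1, 2, 3])                     # embedding input is long type
x = torch.randn(3, 4, dtype=torch.bfloat16)

# Before the change: FP32 embedding output + BF16 linear output promotes to FP32.
print((emb(idx) + linear(x)).dtype)               # torch.float32

# After converting the embedding table to BF16, the residual add stays in BF16.
emb.weight.data = emb.weight.detach().clone().to(torch.bfloat16)
print((emb(idx) + linear(x)).dtype)               # torch.bfloat16
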

File tree

2 files changed: +14 −6 lines changed


intel_pytorch_extension_py/utils.py

Lines changed: 13 additions & 6 deletions
@@ -33,12 +33,19 @@ def _replace_lstm_with_ipex_lstm(model):
         _replace_lstm_with_ipex_lstm(child)

 def convert_module_data_type(module, dtype):
-    if isinstance(module, torch.nn.Conv2d) or isinstance(module, torch.nn.Linear):
-        weight_data = module.weight.detach().clone().to(dtype)
-        module.weight.data = weight_data
-        if module.bias is not None:
-            bias_data = module.bias.detach().clone().to(dtype)
-            module.bias.data = bias_data
+    # convert weights (and bias) of the module to dtype to reduce dtype reorders
+    module_convert_list = [torch.nn.Conv2d,
+                           torch.nn.Linear,
+                           torch.nn.Embedding,
+                           torch.nn.LayerNorm]
+    for module_cls in module_convert_list:
+        if isinstance(module, module_cls):
+            weight_data = module.weight.detach().clone().to(dtype)
+            module.weight.data = weight_data
+            if hasattr(module, 'bias') and module.bias is not None:
+                bias_data = module.bias.detach().clone().to(dtype)
+                module.bias.data = bias_data
+            break
     for child in module.children():
         convert_module_data_type(child, dtype)
     return module
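
A minimal usage sketch of the patched helper, assuming convert_module_data_type is importable from intel_pytorch_extension_py.utils (the file patched above); the exact import path may differ depending on how the package is installed.

import torch
from intel_pytorch_extension_py.utils import convert_module_data_type

model = torch.nn.Sequential(
    torch.nn.Embedding(100, 16),
    torch.nn.LayerNorm(16),
    torch.nn.Linear(16, 16),
)
# Recursively converts weights/biases of Conv2d, Linear, Embedding and
# LayerNorm children to BF16; other module types are left untouched.
convert_module_data_type(model, torch.bfloat16)

print(model[0].weight.dtype, model[1].weight.dtype, model[2].bias.dtype)
# torch.bfloat16 torch.bfloat16 torch.bfloat16
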

torch_ipex/csrc/autocast_mode.cpp

Lines changed: 1 addition & 0 deletions
@@ -180,6 +180,7 @@ MAKE_REGISTER_FUNC(ADD_NS(conv_transpose1d), "conv_transpose1d", Tensor (const T
                   IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), user_defined_dtype)
 MAKE_REGISTER_FUNC(ADD_NS(conv_transpose2d), "conv_transpose2d.input", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&,
                   IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), user_defined_dtype)
+MAKE_REGISTER_FUNC(ADD_NS(layer_norm), "layer_norm", Tensor (const Tensor &, IntArrayRef, const c10::optional<Tensor>&, const c10::optional<Tensor>&, double, bool), user_defined_dtype)

 // fp32 cast policy
 MAKE_REGISTER_FUNC(ADD_NS(avg_pool2d), "avg_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional<int64_t>), fp32)
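
A rough Python analogue (an assumption for illustration, not the actual IPEX dispatcher code) of what registering layer_norm under the user_defined_dtype policy achieves: the input and the optional weight/bias are cast to the autocast dtype before the op runs, so FP32 LayerNorm parameters no longer clash with BF16 activations.

import torch
import torch.nn.functional as F

def autocast_layer_norm(x, normalized_shape, weight=None, bias=None,
                        eps=1e-5, dtype=torch.bfloat16):
    # Hypothetical helper: cast all tensor arguments to the autocast dtype.
    cast = lambda t: t.to(dtype) if t is not None else None
    return F.layer_norm(cast(x), normalized_shape, cast(weight), cast(bias), eps)

x = torch.randn(2, 8, dtype=torch.bfloat16)
ln = torch.nn.LayerNorm(8)          # weight/bias remain FP32
print(autocast_layer_norm(x, (8,), ln.weight, ln.bias).dtype)   # torch.bfloat16
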
