
Commit ae3c00a

Enable lstm bf16 and fp32 in cpu device (#24)
* enable fp32 lstm in cpu device
* lstm enable bf16
* Implement unit test
* add gather into black list
* Remove unnecessary lines and move test case position
* hook at module level
* copy _flat_weights into IpexLSTM # model.bias_ih_l0 will be incorrect
* add fp32 unit test
* refactor LSTM UT
* update comments

Co-authored-by: chunyuan <chunyuan.wu@intel.com>
1 parent 268a6df commit ae3c00a

File tree

9 files changed: +658 -11 lines changed


ideep/ideep/abstract_types.hpp

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ using batch_normalization_flag = dnnl::normalization_flags;
 using query = dnnl::query;
 using scale_t = std::vector<float>;
 using exec_args = std::unordered_map<int, memory>;
+using rnn_direction = dnnl::rnn_direction;
 
 // for computation cache
 using key_t = std::string;

ideep/ideep/operators/lstm.hpp

Lines changed: 47 additions & 2 deletions
@@ -4,8 +4,53 @@
 namespace ideep {
 
 struct lstm_forward : public dnnl::lstm_forward {
-  static void compute() {
-  }
+  using super = dnnl::lstm_forward;
+
+  static void compute(const tensor& src_layer,
+                      const tensor& src_iter,
+                      const tensor& src_iter_c,
+                      const tensor& weights_layer,
+                      const tensor& weights_iter,
+                      const tensor& bias,
+                      tensor& dst_layer,
+                      tensor& dst_iter,
+                      tensor& dst_iter_c,
+                      const bool reverse = false,
+                      const prop_kind aprop = prop_kind::forward_inference,
+                      const engine& aengine = engine::cpu_engine()) {
+    auto direction = reverse ? rnn_direction::unidirectional_right2left
+                             : rnn_direction::unidirectional_left2right;
+    auto src_layer_desc = src_layer.get_desc();
+    auto src_iter_desc = src_iter.get_desc();
+    auto src_iter_c_desc = src_iter_c.get_desc();
+    // use any format for weights
+    auto weights_layer_desc = weights_layer.get_desc().to_format_any();
+    auto weights_iter_desc = weights_iter.get_desc().to_format_any();
+    auto bias_desc = bias.get_desc();
+    auto dst_layer_desc = dst_layer.get_desc();
+    auto dst_iter_desc = dst_iter.get_desc();
+    auto dst_iter_c_desc = dst_iter_c.get_desc();
+
+    auto pd = primitive_desc(
+        {aprop, direction, src_layer_desc, src_iter_desc, src_iter_c_desc,
+         weights_layer_desc, weights_iter_desc, bias_desc,
+         dst_layer_desc, dst_iter_desc, dst_iter_c_desc},
+        aengine);
+
+    auto expected_weights_layer = weights_layer.reorder_if_differ_in(pd.weights_desc());
+    auto expected_weights_iter = weights_iter.reorder_if_differ_in(pd.weights_iter_desc());
+
+    super(pd).execute(stream::default_stream(),
+                      {{DNNL_ARG_SRC_LAYER, src_layer},
+                       {DNNL_ARG_SRC_ITER, src_iter},
+                       {DNNL_ARG_SRC_ITER_C, src_iter_c},
+                       {DNNL_ARG_WEIGHTS_LAYER, expected_weights_layer},
+                       {DNNL_ARG_WEIGHTS_ITER, expected_weights_iter},
+                       {DNNL_ARG_BIAS, bias},
+                       {DNNL_ARG_DST_LAYER, dst_layer},
+                       {DNNL_ARG_DST_ITER, dst_iter},
+                       {DNNL_ARG_DST_ITER_C, dst_iter_c}});
+  }
 };
 
 struct lstm_backward : public dnnl::lstm_backward {

intel_pytorch_extension_py/ops/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,5 +1,6 @@
 from .roi_align import *
 from .nms import *
+from .lstm import *
 from .interaction import *
 from .embeddingbag import *
 from .jit import *

intel_pytorch_extension_py/ops/lstm.py

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+import torch
+from torch.nn.utils.rnn import PackedSequence
+
+# This is a solution to swap the lstm module with the ipex counterpart
+# and will upstream this operator to PyTorch when oneDNN supports
+# bias and src_iter_c in bf16 for bf16 inference. Will keep this
+# for better support of blocked-format weight, e.g. for training.
+
+
+class IpexLSTM(torch.nn.LSTM):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    # port from torch/nn/modules/rnn.py
+    # replace the _VF.lstm with torch.ops.torch_ipex.lstm when the input is not PackedSequence
+    def forward(self, input, hx=None):  # noqa: F811
+        orig_input = input
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            # fall back to PyTorch LSTM since PackedSequence is unsupported in oneDNN
+            return super(IpexLSTM, self).forward(input, hx)
+        else:
+            batch_sizes = None
+            max_batch_size = input.size(0) if self.batch_first else input.size(1)
+            sorted_indices = None
+            unsorted_indices = None
+
+            if hx is None:
+                num_directions = 2 if self.bidirectional else 1
+                real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size
+                h_zeros = torch.zeros(self.num_layers * num_directions,
+                                      max_batch_size, real_hidden_size,
+                                      dtype=input.dtype, device=input.device)
+                c_zeros = torch.zeros(self.num_layers * num_directions,
+                                      max_batch_size, self.hidden_size,
+                                      dtype=input.dtype, device=input.device)
+                hx = (h_zeros, c_zeros)
+            else:
+                # Each batch of the hidden state should match the input sequence that
+                # the user believes he/she is passing in.
+                hx = self.permute_hidden(hx, sorted_indices)
+
+            self.check_forward_args(input, hx, batch_sizes)
+            result = torch.ops.torch_ipex.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
+                                               self.dropout, self.training, self.bidirectional, self.batch_first)
+            output = result[0]
+            hidden = result[1:]
+
+            return output, self.permute_hidden(hidden, unsorted_indices)
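
Since IpexLSTM subclasses torch.nn.LSTM, it can also be constructed directly as a drop-in replacement at inference time. A minimal sketch of both dispatch paths (hypothetical usage, not part of this commit; the import path and the availability of torch.ops.torch_ipex.lstm are assumptions based on the file above):

import torch
from torch.nn.utils.rnn import pack_padded_sequence
# import path assumed from intel_pytorch_extension_py/ops/lstm.py above;
# loading the extension itself is what registers torch.ops.torch_ipex.lstm
from intel_pytorch_extension_py.ops.lstm import IpexLSTM

lstm = IpexLSTM(input_size=32, hidden_size=64, num_layers=1, batch_first=True).eval()
x = torch.randn(8, 16, 32)  # (batch, seq_len, input_size)

with torch.no_grad():
    # plain tensor input: forward() calls torch.ops.torch_ipex.lstm
    y, (h, c) = lstm(x)

    # PackedSequence input: forward() falls back to the stock nn.LSTM path
    lengths = torch.full((8,), 16, dtype=torch.long)
    packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
    y_packed, _ = lstm(packed)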

intel_pytorch_extension_py/utils.py

Lines changed: 20 additions & 0 deletions
@@ -1,4 +1,8 @@
+import copy
+
 import torch
+
+from .ops.lstm import IpexLSTM
 from .fx import *
 
 def _replace_dropout_with_identity(model):
@@ -11,6 +15,22 @@ def _replace_dropout_with_identity(model):
             else:
                 _replace_dropout_with_identity(child)
 
+def _replace_lstm_with_ipex_lstm(model):
+    # replace lstm with ipex lstm during inference
+    # does not support the case where model itself is torch.nn.LSTM
+    if not model.training:
+        for child_name, child in model.named_children():
+            if isinstance(child, torch.nn.LSTM):
+                assert hasattr(child, "weight_ih_l0"), "torch.nn.LSTM should have weight_ih_l0"
+                ipex_lstm = IpexLSTM(child.input_size, child.hidden_size,
+                                     child.num_layers, child.bias, child.batch_first,
+                                     child.dropout, child.bidirectional, child.proj_size,
+                                     child.weight_ih_l0.device, child.weight_ih_l0.dtype)
+                ipex_lstm.__dict__ = copy.deepcopy(child.__dict__)
+                setattr(model, child_name, ipex_lstm)
+            else:
+                _replace_lstm_with_ipex_lstm(child)
+
 def convert_module_data_type(module, dtype):
     if isinstance(module, torch.nn.Conv2d) or isinstance(module, torch.nn.Linear):
         weight_data = module.weight.detach().clone().to(dtype)
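
For models that contain LSTM submodules, the helper above is the intended entry point: it walks named_children(), swaps each torch.nn.LSTM for an IpexLSTM, and copies the original module's __dict__ (including _flat_weights) so the trained parameters stay intact. A rough usage sketch, assuming the package imports as intel_pytorch_extension (matching the ipex alias used in the tests below):

import torch
import intel_pytorch_extension as ipex  # import name assumed

class Tagger(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # the swap handles LSTM submodules, not a model that is itself nn.LSTM
        self.lstm = torch.nn.LSTM(input_size=128, hidden_size=256, num_layers=2)
        self.fc = torch.nn.Linear(256, 10)

    def forward(self, x):
        y, _ = self.lstm(x)
        return self.fc(y)

model = Tagger().eval()  # must be in eval mode; the swap is skipped for training
ipex.utils._replace_lstm_with_ipex_lstm(model)
print(type(model.lstm).__name__)  # expected: IpexLSTM

with torch.no_grad():
    out = model(torch.randn(16, 4, 128))  # (seq_len, batch, input_size)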

tests/cpu/test_autocast.py

Lines changed: 139 additions & 0 deletions
@@ -5,6 +5,7 @@
 from common_utils import TestCase
 import time, sys
 from torch.testing._core import _get_default_tolerance
+import itertools
 
 def get_rand_seed():
     return int(time.time() * 1000000000)
@@ -212,5 +213,143 @@ def test_embeddingbag_op(self):
         self.assertEqual(traininig_out.dtype, torch.float)
         self.assertEqual(cpu_out, traininig_out)
 
+class M(nn.Module):
+    def __init__(self, input_size, hidden_size, num_layers, bidirectional, bias, dropout, batch_first):
+        super(M, self).__init__()
+        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional, bias=bias, dropout=dropout, batch_first=batch_first)
+
+    def forward(self, x, h = None):
+        x, h = self.lstm(x, h)
+        return x, h
+class TestLSTM(TestCase):
+    def _lstm_params_list(self):
+        params_dict = {
+            "input_size": [1, 2],
+            "hidden_size": [5],
+            "num_layers": [1, 3],
+            "bidirectional": [False, True],
+            "bias": [False, True],
+            "empty_state": [False, True],
+            "batch_first": [False, True],
+            "dropout": [0, 1],
+            "batch_size": [1, 2],
+            "seq_len": [1, 3]
+        }
+
+        params_list = []
+        for key, value in params_dict.items():
+            params_list.append(value)
+        return params_list
+
+    def _cast_dtype(self, input, bf16):
+        if bf16:
+            input = input.to(torch.bfloat16)
+        return input
+
+    def _test_lstm(self, training, bf16, prec = 1e-5):
+        rand_seed = int(get_rand_seed())
+        print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))
+        torch.manual_seed(rand_seed)
+        with torch.set_grad_enabled(training):
+            params_list = self._lstm_params_list()
+            for input_size, hidden_size, num_layers, bidirectional, bias, empty_state, batch_first, dropout, batch_size, seq_len in itertools.product(*params_list):
+                # dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1
+                if dropout > 0 and num_layers == 1:
+                    continue
+
+                num_directions = 2 if bidirectional else 1
+
+                if batch_first:
+                    input = torch.randn(batch_size, seq_len, input_size)
+                else:
+                    input = torch.randn(seq_len, batch_size, input_size)
+                h = torch.randn(num_layers * num_directions, batch_size, hidden_size)
+                c = torch.randn(num_layers * num_directions, batch_size, hidden_size)
+
+                input_ipex = copy.deepcopy(input)
+                h_ipex = copy.deepcopy(h)
+                c_ipex = copy.deepcopy(c)
+
+                model = M(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional, bias=bias, dropout=dropout, batch_first=batch_first)
+                model.train() if training else model.eval()
+
+                model_ipex = copy.deepcopy(model)
+                model_ipex.train() if training else model_ipex.eval()
+                ipex.utils._replace_lstm_with_ipex_lstm(model_ipex)
+
+                with ipex.amp.autocast(enabled=bf16, configure=ipex.conf.AmpConf(torch.bfloat16)):
+                    if empty_state:
+                        y, hy = model(self._cast_dtype(input, bf16))
+                        y_ipex, hy_ipex = model_ipex(input)
+                    else:
+                        y, hy = model(input, (self._cast_dtype(h, bf16), self._cast_dtype(c, bf16)))
+                        y_ipex, hy_ipex = model_ipex(input, (h, c))
+
+                if not training and bf16:
+                    self.assertEqual(input_ipex.dtype, torch.float)
+                    self.assertEqual(h_ipex.dtype, torch.float)
+                    self.assertEqual(c_ipex.dtype, torch.float)
+
+                    # with mkldnn LSTM, y, hy[0] is bf16 and hy[1] is fp32
+                    self.assertEqual(y_ipex.dtype, torch.bfloat16)
+                    self.assertEqual(hy_ipex[0].dtype, torch.bfloat16)
+                    self.assertEqual(hy_ipex[1].dtype, torch.float)
+                self.assertEqual(y, y_ipex, prec=prec)
+                self.assertEqual(hy[0], hy_ipex[0], prec=prec)
+
+                self.assertEqual(hy[1], self._cast_dtype(hy_ipex[1], bf16), prec=prec)
+
+    def _test_lstm_pack_padded_sequence(self):
+        embedding_dim = 1024
+        hidden_dim = 10
+        batch_size = 24
+        num_layers = 1
+        bidirectional = True
+        num_direc = 2 if bidirectional else 1
+        max_lens = 96
+
+        sent = torch.randn(batch_size, max_lens, embedding_dim)
+        hid_0 = torch.rand(num_layers * num_direc, batch_size, hidden_dim)
+        hid_1 = torch.randn(num_layers * num_direc, batch_size, hidden_dim)
+
+        sentences = sent.clone().requires_grad_(False)
+        sent_lens = torch.Tensor([1, 2, 3, 4, 5, 1, 3, 2, 96, 5, 3, 1, 1, 2, 1, 2, 3, 6, \
+                                  1, 2, 4, 6, 2, 1])
+
+        assert sent_lens.shape[0] == batch_size
+        assert sent_lens.max().item() == max_lens
+
+        hidden_0 = hid_0.clone().requires_grad_(False)
+        hidden_1 = hid_1.clone().requires_grad_(False)
+        embeds = torch.nn.utils.rnn.pack_padded_sequence(sentences, sent_lens, batch_first=True, enforce_sorted=False)
+
+        model = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, batch_first=True)
+
+        model_ipex = copy.deepcopy(model)
+        ipex.utils._replace_lstm_with_ipex_lstm(model_ipex)
+
+        lstm_out, hidden_out = model(embeds, (hidden_0, hidden_1))
+        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
+
+        lstm_out_ipex, hidden_out_ipex = model_ipex(embeds, (hidden_0, hidden_1))
+        lstm_out_ipex, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out_ipex, batch_first=True)
+
+        self.assertEqual(lstm_out, lstm_out_ipex)
+        self.assertEqual(hidden_out[0], hidden_out_ipex[0])
+        self.assertEqual(hidden_out[1], hidden_out_ipex[1])
+
+    def test_lstm_inference(self):
+        self._test_lstm(training=False, bf16=False)
+
+        self._test_lstm(training=False, bf16=True, prec=2e-2)
+
+        self._test_lstm(training=True, bf16=False)
+
+        # TODO: autocast does not support LSTM bf16 training
+        # self._test_lstm(training=True, bf16=True)
+
+    def test_lstm_pack_padded_sequence(self):
+        self._test_lstm_pack_padded_sequence()
+
 if __name__ == '__main__':
     test = unittest.main()

torch_ipex/csrc/autocast_mode.cpp

Lines changed: 1 addition & 0 deletions
@@ -195,6 +195,7 @@ MAKE_REGISTER_FUNC(ADD_NS(std), "std", Tensor (const Tensor &, bool), fp32)
 MAKE_REGISTER_FUNC(ADD_NS(std), "std.dim", Tensor (const Tensor &, IntArrayRef, bool, bool), fp32)
 MAKE_REGISTER_FUNC(ADD_NS(instance_norm), "instance_norm", Tensor (const Tensor &, const c10::optional<Tensor>&, const c10::optional<Tensor>&, const c10::optional<Tensor>&, const c10::optional<Tensor>&, bool, double, double, bool), fp32)
 MAKE_REGISTER_FUNC(ADD_NS(grid_sampler), "grid_sampler", Tensor (const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32)
+MAKE_REGISTER_FUNC(ADD_NS(gather), "gather", Tensor (const Tensor &, int64_t, const Tensor &, bool), fp32)
 
 // promote
 MAKE_REGISTER_FUNC(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote)
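
The new entry pins aten::gather to the fp32 policy of the IPEX autocast dispatcher, so bf16 floating-point inputs are cast up before the kernel runs. A rough illustration under the same ipex.amp.autocast front end exercised in the tests (import name assumed; the expected dtype follows from the fp32 registration above):

import torch
import intel_pytorch_extension as ipex  # import name assumed

src = torch.randn(4, 8).to(torch.bfloat16)
index = torch.zeros(4, 8, dtype=torch.long)

with ipex.amp.autocast(enabled=True, configure=ipex.conf.AmpConf(torch.bfloat16)):
    out = torch.gather(src, 1, index)

# gather is registered as fp32, so the bf16 input is expected to be cast
# to float and the result should come back as torch.float32 rather than bf16
print(out.dtype)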

torch_ipex/csrc/cpu/ExtendOPs.h

Lines changed: 4 additions & 9 deletions
@@ -3,7 +3,7 @@
 
 #include <ATen/Tensor.h>
 #include <torch/extension.h>
-
+#include "ideep/ideep.hpp"
 
 namespace torch_ipex {
 
@@ -86,14 +86,9 @@ class AtenIpexTypeExt {
       const at::Tensor& dboxes_xywh,
       const double scale_xy,
      const double scale_wh);
+  static std::tuple<at::Tensor, at::Tensor, at::Tensor> lstm(
+      const at::Tensor& input, std::vector<at::Tensor> hx, std::vector<at::Tensor> params, bool has_biases,
+      int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first);
 };
 
 } // namespace torch_ipex
-
-// namespace {
-//   static auto dispatch =
-//       torch::RegisterOperators()
-//           // .op("torch_ipex::embedding_bag", &torch_ipex::AtenIpexTypeExt::embedding_bag)
-//           .op("torch_ipex::interaction_forward", &torch_ipex::AtenIpexTypeExt::interaction_forward)
-//           .op("torch_ipex::interaction_backward", &torch_ipex::AtenIpexTypeExt::interaction_backward);
-// }
