Commit a986bd3
(1) remove any tag in inner_product (2) add a UT for linear reorder (#158)
* (1) remove any tag in inner_product (2) add a UT for linear reorder
* add inputs for UT test_linear_reorder
* add w/o bias in linear_reorder UT
* fix for format_list string
* format some changes
* commit format minor changes
1 parent 7b34979 commit a986bd3
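The core change drops `format_tag::any` (and the matching `reorder_if_differ_in` round-trips) from the inner-product primitives, so oneDNN consumes src/weights/dst in their existing plain layouts instead of staging them through a blocked one. The new UT verifies this by running a linear workload under `DNNL_VERBOSE=1` and counting reorder lines. A minimal sketch of the check the UT automates (not part of the commit; it drives the `linear_reorder.py` script added below):

```python
import os
import subprocess

# Run the linear workload with oneDNN verbose logging enabled.
env = dict(os.environ, DNNL_VERBOSE="1")
out = subprocess.run(["python", "-u", "linear_reorder.py"],
                     capture_output=True, text=True, env=env).stdout

# Every executed reorder primitive shows up as a verbose line of the form
# "dnnl_verbose,exec,cpu,reorder,...". After this commit, only the weight
# prepack reorders should remain.
reorders = [l for l in out.splitlines()
            if l.startswith("dnnl_verbose") and ",reorder," in l]
print(f"{len(reorders)} reorder(s) executed")
```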

File tree

5 files changed: +170 -60 lines changed

* ideep/ideep/operators/inner_product.hpp
* tests/cpu/common_utils.py
* tests/cpu/linear_reorder.py
* tests/cpu/test_linear_reorder.py
* torch_ipex/csrc/cpu/Linear.cpp

ideep/ideep/operators/inner_product.hpp

Lines changed: 30 additions & 52 deletions
```diff
@@ -164,7 +164,6 @@ struct inner_product_forward : public dnnl::inner_product_forward {
       }
     } else {
       op_attr = attr;
-      src_desc = {src.get_dims(), data_type::f32, format_tag::any};
       if (src.has_scale()) {
         auto src_scale = src.get_scale();
         src_scale[0] = 1.f / src_scale[0];
@@ -178,56 +177,50 @@ struct inner_product_forward : public dnnl::inner_product_forward {
       // align weights data type with src
       dst_data_type = src.get_data_type() == data_type::bf16 ? data_type::bf16
                                                              : data_type::f32;
-      src_desc = src.get_desc().to_type(dst_data_type).to_format_any();
-      weights_desc = weights.get_desc().to_type(dst_data_type).to_format_any();
+      src_desc = src.get_desc().to_type(dst_data_type);
+      weights_desc = weights.get_desc().to_type(dst_data_type);
       if (with_bias) {
         IDEEP_ENFORCE(utils::one_of(bias.get_data_type(),
                                     data_type::f32, data_type::bf16),
                       "Incorrect data type in bias");
-        bias_desc = bias.get_desc().to_format_any();
+        bias_desc = bias.get_desc();
       }
     }

-    tensor::desc dst_desc(dst_dims, dst_data_type, format_tag::any);
+    tensor::desc dst_desc = dst.get_desc().to_type(dst_data_type);
     auto pd = with_bias
         ? primitive_desc({aprop_kind, src_desc, weights_desc, bias_desc,
                           dst_desc}, op_attr, aengine)
        : primitive_desc({aprop_kind, src_desc, weights_desc, dst_desc},
                         op_attr, aengine);

-    auto expected_src = src.reorder_if_differ_in(pd.src_desc(), src_attr);
-    auto expected_weights = weights.reorder_if_differ_in(pd.weights_desc(), weights_attr);
     // [ Note output buffer ]
     // In this case, dst is an empty ideep tensor, can be re-init
     // If dst is not empty, ideep must write result to dst's memory and it is caller's duty to
     // make sure dst is big enough to hold the result
     if (dst.is_empty()) {
       dst.init(pd.dst_desc());
     }
-    auto expected_dst = dst.reorder_if_differ_in(pd.dst_desc());
-    if (!dst_scales.empty() && utils::one_of(dst.get_data_type(), data_type::u8, data_type::s8)) {
-      expected_dst.set_scale(dst_scales_in);
+
+    if (!dst_scales.empty() &&
+        utils::one_of(dst.get_data_type(), data_type::u8, data_type::s8)) {
+      dst.set_scale(dst_scales_in);
     }

     if (with_bias){
-      auto expected_bias = bias.reorder_if_differ_in(pd.bias_desc(), bias_attr);
-      super(pd).execute(stream::default_stream(),
-                        {{DNNL_ARG_SRC, expected_src},
-                         {DNNL_ARG_WEIGHTS, expected_weights},
-                         {DNNL_ARG_BIAS, expected_bias},
-                         {DNNL_ARG_DST, expected_dst}});
+      super(pd).execute(stream::default_stream(), {{DNNL_ARG_SRC, src},
+                                                   {DNNL_ARG_WEIGHTS, weights},
+                                                   {DNNL_ARG_BIAS, bias},
+                                                   {DNNL_ARG_DST, dst}});
     } else {
-      super(pd).execute(stream::default_stream(),
-                        {{DNNL_ARG_SRC, expected_src},
-                         {DNNL_ARG_WEIGHTS, expected_weights},
-                         {DNNL_ARG_DST, expected_dst}});
+      super(pd).execute(stream::default_stream(), {{DNNL_ARG_SRC, src},
+                                                   {DNNL_ARG_WEIGHTS, weights},
+                                                   {DNNL_ARG_DST, dst}});
     }

-    if (attr.non_negitive_output() && expected_dst.get_data_type() == data_type::s8) {
-      expected_dst.to_type(data_type::u8);
+    if (attr.non_negitive_output() && dst.get_data_type() == data_type::s8) {
+      dst.to_type(data_type::u8);
     }
-    // reorder back to dst's buffer if needed
-    expected_dst.reorder_to_if_differ_from(dst);
   }
 };

@@ -242,11 +235,6 @@ struct inner_product_backward_data : public dnnl::inner_product_backward_data {
       tensor& diff_src,
       const engine& aengine = engine::cpu_engine()) {
     auto weights_ = weights;
-    if (diff_dst.get_data_type() == data_type::bf16) {
-      weights_.init(weights.get_desc().to_type(data_type::bf16));
-      weights_.reorder_from(weights);
-    }
-
     // workaround: diff_src and weights from caffe2 may have different dims.
     // It would be better for caffe2 to do this reshape anyway.
     if (diff_src_dims.size() != weights.ndims()) {
@@ -255,10 +243,9 @@ struct inner_product_backward_data : public dnnl::inner_product_backward_data {
       weights_.reshape(new_dims);
     }

-    auto diff_dst_desc = diff_dst.get_desc().to_format_any();
-    auto weights_desc = weights_.get_desc().to_format_any();
-    auto diff_src_desc =
-        tensor::desc(diff_src_dims, diff_dst.get_data_type(), tag::any);
+    auto diff_dst_desc = diff_dst.get_desc();
+    auto weights_desc = weights_.get_desc();
+    auto diff_src_desc = diff_src.get_desc().to_type(diff_dst.get_data_type());

     auto forward_hints =
         inner_product_forward::primitive_desc(
@@ -268,8 +255,6 @@ struct inner_product_backward_data : public dnnl::inner_product_backward_data {
     auto pd = primitive_desc(
         {diff_src_desc, weights_desc, diff_dst_desc}, aengine, forward_hints);

-    auto expected_diff_dst = diff_dst.reorder_if_differ_in(pd.diff_dst_desc());
-    auto expected_weights = weights_.reorder_if_differ_in(pd.weights_desc());
     // diff_src's origin content are not used, so it can be re-init directly
     // It's caller's duty to make sure diff_src's buffer size is same with it actually needed
     // Here we dose not support to write to given strided buffer since we know the grad is always contiguous
@@ -280,8 +265,8 @@ struct inner_product_backward_data : public dnnl::inner_product_backward_data {
     }

     super(pd).execute(stream::default_stream(),
-                      {{DNNL_ARG_DIFF_DST, expected_diff_dst},
-                       {DNNL_ARG_WEIGHTS, expected_weights},
+                      {{DNNL_ARG_DIFF_DST, diff_dst},
+                       {DNNL_ARG_WEIGHTS, weights_},
                        {DNNL_ARG_DIFF_SRC, diff_src}});
   }
 };
@@ -319,18 +304,17 @@ struct inner_product_backward_weights
       tensor& diff_bias,
       const data_type diff_weight_type,
       const engine& aengine = engine::cpu_engine()) {
-    auto src_desc = src.get_desc().to_format_any();
-    auto diff_dst_desc = diff_dst.get_desc().to_format_any();
+    auto src_desc = src.get_desc();
+    auto diff_dst_desc = diff_dst.get_desc();
     auto diff_weights_dims = src.get_dims();
     diff_weights_dims[0] = diff_dst.get_dim(1);
     data_type diff_dst_type = diff_dst.get_data_type();
     data_type diff_weight_type_in = data_type::undef== diff_weight_type ?
                                     diff_dst_type : diff_weight_type;
-    auto diff_weights_desc =
-        tensor::desc(diff_weights_dims, diff_weight_type_in, tag::any);

-    auto diff_bias_desc =
-        tensor::desc({diff_dst.get_dim(1)}, diff_weight_type_in, tag::any);
+    auto diff_weights_desc =
+        diff_weights.get_desc().to_type(diff_weight_type_in);
+    auto diff_bias_desc = diff_bias.get_desc().to_type(diff_weight_type_in);

     // for forward hint, weights_desc should have same data_type
     // with other input desc, expect for bias_desc
@@ -349,18 +333,13 @@ struct inner_product_backward_weights
         : primitive_desc({src_desc, diff_weights_desc, diff_dst_desc},
                          aengine, forward_hints);

-    auto expected_diff_dst = diff_dst.reorder_if_differ_in(pd.diff_dst_desc());
-    auto expected_src = src.reorder_if_differ_in(pd.src_desc());
     if (diff_weights.is_empty()){
       diff_weights.init(pd.diff_weights_desc());
     }
-    // Here we need to write to given strided buffer, so if given buffer is different with the best format
-    // We need to firstly init a new buffer to store the output, and copy the output to a given buffer
-    tensor expected_diff_weights = diff_weights.get_desc() == pd.diff_weights_desc() ? diff_weights : tensor(pd.diff_weights_desc());

-    exec_args args {{DNNL_ARG_DIFF_DST, expected_diff_dst},
-                    {DNNL_ARG_SRC, expected_src},
-                    {DNNL_ARG_DIFF_WEIGHTS ,expected_diff_weights}};
+    exec_args args{{DNNL_ARG_DIFF_DST, diff_dst},
+                   {DNNL_ARG_SRC, src},
+                   {DNNL_ARG_DIFF_WEIGHTS, diff_weights}};

     if (with_diff_bias) {
       if (diff_bias.is_empty()){
@@ -373,7 +352,6 @@ struct inner_product_backward_weights
     }

     super(pd).execute(stream::default_stream(), args);
-    expected_diff_weights.reorder_to_if_differ_from(diff_weights);
   }
 };
```
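In short, the forward and both backward paths now build their primitive descriptors from the tensors' existing plain descriptors instead of `format_tag::any`, and the `expected_*` staging tensors with their `reorder_if_differ_in` / `reorder_to_if_differ_from` round-trips are dropped, so the primitives read and write the caller's buffers directly.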

tests/cpu/common_utils.py

Lines changed: 35 additions & 5 deletions
```diff
@@ -1065,29 +1065,59 @@ def get_reorder_info(self, line):
         assert self.is_dnnl_reorder(line)
         tokens = line.split(',')
         src_desc, dst_desc = tokens[6].split(' ')
-        src_dtype = src_desc.split('::')[0].split('-')
+        src_dtype = src_desc.split('::')[0].split('_')
         src_format = src_desc.split('::')[1]
-        dst_dtype = dst_desc.split('::')[0].split('-')
+        dst_dtype = dst_desc.split('::')[0].split('_')
         dst_format = dst_desc.split('::')[1]
         return src_dtype, src_format, dst_dtype, dst_format

+    def isPlainFormat(self, check_format):
+        format_index = 0
+        format = ""
+        for check in check_format.split(':'):
+            if check == "blocked":
+                break
+            format_index = format_index+1
+        format = check_format.split(':')[format_index+1]
+        # ref to https://spec.oneapi.io/versions/latest/elements/oneDNN/source/data_model/memory/formats.html#
+        format_list=["a",
+                     "ab","ba",
+                     "acb","abc","bac","cba","bca",
+                     "abcd","abdc","acdb","bacd","bcda","cdba","dcab",
+                     "abcde","abdec","acbde","acdeb","bacde","bcdea","cdeba","decab",
+                     "abcdef","acbdef","defcab"]
+        for f in format_list:
+            if f == format:
+                return True
+        return False
+
+    def RedundantReorder(self, line):
+        if not self.is_dnnl_reorder(line):
+            return False
+        src_dtype, src_format, dst_dtype, dst_format = self.get_reorder_info(line)
+        return src_dtype[1] == dst_dtype[1] and src_format == dst_format
+
     def ReorderForPack(self, line):
         if not self.is_dnnl_reorder(line):
             return False
         src_dtype, src_format, dst_dtype, dst_format = self.get_reorder_info(line)
-        return src_dtype == dst_dtype
+        if self.isPlainFormat(src_format) and self.isPlainFormat(dst_format): # for prepack, at least dst should be blocked format and not in the format list
+            return False
+        return src_dtype[1] == dst_dtype[1]

     def OnlyReorderDtype(self, line):
         if not self.is_dnnl_reorder(line):
             return False
         src_dtype, src_format, dst_dtype, dst_format = self.get_reorder_info(line)
-        return src_dtype != dst_dtype and src_format == dst_dtype
+        return src_dtype[1] != dst_dtype[1] and src_format == dst_format

     def OnlyReorderFormat(self, line):
         if not self.is_dnnl_reorder(line):
             return False
         src_dtype, src_format, dst_dtype, dst_format = self.get_reorder_info(line)
-        return src_dtype == dst_dtype and src_format != dst_dtype
+        if self.isPlainFormat(src_format) and not self.isPlainFormat(dst_format): # reorder from plain format to blocked, should be prepack reorder
+            return False
+        return src_dtype[1] == dst_dtype[1] and src_format != dst_format

     def assertOnlyReorderDtype(self, line):
         assert OnlyReorderDtype(line), 'the verbose msg shows not only reorder dtype'
```
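For reference, a hand-written example of the oneDNN verbose format these helpers parse (illustrative only, not captured from a real run):

```python
# Field 6 holds "src_<dtype>::<format> dst_<dtype>::<format>".
line = ("dnnl_verbose,exec,cpu,reorder,jit:uni,undef,"
        "src_f32::blocked:ab:f0 dst_f32::blocked:AB16b64a:f0,,,128x479,0.05")

tokens = line.split(',')
src_desc, dst_desc = tokens[6].split(' ')
src_dtype = src_desc.split('::')[0].split('_')   # ['src', 'f32']
src_format = src_desc.split('::')[1]             # 'blocked:ab:f0'
dst_dtype = dst_desc.split('::')[0].split('_')   # ['dst', 'f32']
dst_format = dst_desc.split('::')[1]             # 'blocked:AB16b64a:f0'

# 'ab' is in the plain-format list while 'AB16b64a' is not, so isPlainFormat
# is True for src and False for dst: same dtype, plain -> blocked, which
# ReorderForPack classifies as a prepack reorder.
```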

tests/cpu/linear_reorder.py

Lines changed: 48 additions & 0 deletions
```diff
@@ -0,0 +1,48 @@
+import torch
+import intel_pytorch_extension as ipex
+import torch.nn as nn
+import itertools
+
+class Model(nn.Module):
+    def __init__(self, ic, oc, bias):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(ic, oc, bias=bias)
+
+    def forward(self, input):
+        return self.linear(input)
+
+def run_model(dtype=None):
+    out_feature = [1024, 256, 1, torch.randint(3, 10, (1, )).item()]
+    in_feature = [128, 479, torch.randint(3, 10, (1, )).item()]
+    input_shapes=[]
+    for s in in_feature:
+        input_shapes += [(128, s), (2, 64, s), (2, 2, 32, s)]
+    options = itertools.product(out_feature, [True, False], input_shapes)
+    for out_features, bias, x_shape in options:
+        in_features = x_shape[-1]
+        x = torch.randn(x_shape, dtype=torch.float32).requires_grad_()
+        model = Model(in_features, out_features, bias)
+        optimizer = torch.optim.Adagrad(model.parameters(), lr=0.1)
+        if dtype == 0 :
+            conf = ipex.AmpConf(torch.float32)
+            model, optimizer = ipex.optimize(model, dtype=torch.float32, optimizer=optimizer, level='O1')
+            with ipex.amp.autocast(enabled=True, configure=conf):
+                run_mod = model.forward(x).sum()
+        elif dtype == 1 :
+            conf = ipex.AmpConf(torch.bfloat16)
+            model, optimizer = ipex.optimize(model, dtype=torch.bfloat16, optimizer=optimizer, level='O1')
+            with ipex.amp.autocast(enabled=True, configure=conf):
+                run_mod = model.forward(x).sum()
+        else: # reserved
+            pass
+        optimizer.zero_grad()
+        run_mod.backward()
+        optimizer.step()
+
+
+if __name__ == "__main__":
+    print(f"fp32, {'*' * 50}")
+    run_model(0)
+
+    print(f"bf16, {'*' * 50}")
+    run_model(1)
```

tests/cpu/test_linear_reorder.py

Lines changed: 36 additions & 0 deletions
```diff
@@ -0,0 +1,36 @@
+import unittest
+from common_utils import VerboseTestCase
+import subprocess
+class TestLinearReorder(VerboseTestCase):
+    def test_linear_reorder(self):
+        with subprocess.Popen('DNNL_VERBOSE=1 python -u linear_reorder.py', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
+            segmentation = {
+                'fp32': {'reorder_for_pack': 2, 'reorder_for_dtype': 0, 'reorder_for_format': 0, 'redundent_reorder' : 0,},
+                'bf16': {'reorder_for_pack': 3, 'reorder_for_dtype': 0, 'reorder_for_format': 0, 'redundent_reorder' : 0,},
+            } # there should be only reorders on prepack, if any other reorder appears, will cause fail
+            seg = None
+            for line in p.stdout.readlines():
+                line = str(line, 'utf-8').strip()
+                if line.endswith('***************'):
+                    seg = line.strip().split(',')[0]
+                    continue
+                # Following is to check if there is the reorder number is as excepted
+                if self.is_dnnl_verbose(line) and self.ReorderForPack(line):
+                    segmentation[seg]['reorder_for_pack'] -= 1
+                    self.assertTrue(segmentation[seg]['reorder_for_pack'] >=0, "show unexpected reorder for pack")
+
+                if self.is_dnnl_verbose(line) and self.OnlyReorderDtype(line):
+                    segmentation[seg]['reorder_for_dtype'] -= 1
+                    self.assertTrue(segmentation[seg]['reorder_for_dtype'] >=0, "show unexpected reorder for dtype")
+
+                if self.is_dnnl_verbose(line) and self.OnlyReorderFormat(line):
+                    segmentation[seg]['reorder_for_format'] -= 1
+                    self.assertTrue(segmentation[seg]['reorder_for_format'] >=0, "show unexpected reorder for format")
+
+                if self.is_dnnl_verbose(line) and self.RedundantReorder(line):
+                    segmentation[seg]['redundent_reorder'] -= 1
+                    self.assertTrue(segmentation[seg]['redundent_reorder'] >=0, "show unexpected redundent reorder")
+
+
+if __name__ == '__main__':
+    test = unittest.main()
```
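A brief usage note (not part of the commit): the test launches `linear_reorder.py` through a shell command with `DNNL_VERBOSE=1` set, and the script path is resolved relative to the working directory, so it should be run from `tests/cpu`, e.g. `python test_linear_reorder.py`.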

torch_ipex/csrc/cpu/Linear.cpp

Lines changed: 21 additions & 3 deletions
```diff
@@ -155,8 +155,23 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> linear_backward_impl(
   at::Tensor grad_input, grad_weight, grad_bias;
   // weight's desc is needed for both bw_d and bw_w
   const ideep::tensor w = get_linear_prepacked_weight(weight, out_features, in_features);
-  auto input_reshaped = input.dim() > 2 ? input.reshape({-1, input.size(input.dim() - 1)}) : input;
-  auto grad_output_reshaped = grad_output.dim() > 2 ? grad_output.reshape({-1, grad_output.size(grad_output.dim() - 1)}) : grad_output;
+  // for IP, currently both stag=ab and dtag=ab are only supported by onednn, we
+  // need first make both src and diff_dst contiguous if the input or
+  // grad_output is not expected
+  auto input_contiguous = input.is_contiguous() ? input : input.contiguous();
+  auto input_reshaped =
+      input_contiguous.dim() > 2
+          ? input_contiguous.reshape(
+                {-1, input_contiguous.size(input_contiguous.dim() - 1)})
+          : input_contiguous;
+  auto grad_output_contiguous =
+      grad_output.is_contiguous() ? grad_output : grad_output.contiguous();
+  auto grad_output_reshaped =
+      grad_output_contiguous.dim() > 2
+          ? grad_output_contiguous.reshape(
+                {-1,
+                 grad_output_contiguous.size(grad_output_contiguous.dim() - 1)})
+          : grad_output_contiguous;
   const ideep::tensor grady = itensor_view_from_dense(grad_output_reshaped);
   if (output_mask[0]) {
     at::Tensor grad_input_reshaped = at::empty_like(input_reshaped);
@@ -166,7 +181,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> linear_backward_impl(
     ideep::inner_product_backward_data::compute(
       grady, w, input_reshaped.sizes().vec(), gradx
     );
-    grad_input = input.dim() > 2 ? grad_input_reshaped.reshape(input.sizes().vec()) : grad_input_reshaped;
+    grad_input =
+        input_contiguous.dim() > 2
+            ? grad_input_reshaped.reshape(input_contiguous.sizes().vec())
+            : grad_input_reshaped;
   }
   if (output_mask[1] || output_mask[2]) {
     //bw_w
```
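The contiguity guard above matters because `itensor_view_from_dense` wraps the tensor's existing memory rather than copying it, and a transposed or otherwise strided tensor is not in the plain `ab` layout the inner-product primitive now expects. A standalone PyTorch illustration of the condition being handled (plain PyTorch semantics, independent of this codebase):

```python
import torch

x = torch.randn(128, 479)
y = x.t()                  # a transposed view: same storage, swapped strides
print(y.is_contiguous())   # False - not a dense row-major ("ab") layout
z = y.contiguous()         # materializes a dense row-major copy
print(z.is_contiguous())   # True - safe to hand to oneDNN as a plain tensor
```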
