
Commit 060ea58

Fuse shuffle (#183)
* Fuse the shuffle pattern and leverage the oneDNN implementation, because PyTorch does not provide a shuffle primitive.
* Fix code style issues (formatted by clang-format).
1 parent f4af2b9 commit 060ea58
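
Background for the fusion: PyTorch has no dedicated shuffle primitive, so channel shuffle is written in eager mode as a view, a transpose, and a second view; this commit matches that subgraph and replaces it with a single oneDNN channel-shuffle call. A minimal sketch of the eager pattern (illustrative only, not code from this commit):

import torch

def channel_shuffle(x, groups):
    # x: (N, C, H, W) with C divisible by groups.
    n, c, h, w = x.shape
    x = x.view(n, groups, c // groups, h, w)  # split channels into groups
    x = x.transpose(1, 2).contiguous()        # swap group and per-group dims
    return x.view(n, c, h, w)                 # flatten back to (N, C, H, W)

Fusing the three ops into one primitive should also avoid materializing the intermediate contiguous copy.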

File tree: 4 files changed, +384 −341 lines


tests/cpu/test_jit.py (6 additions, 0 deletions)

@@ -872,6 +872,12 @@ def test_output_linear_gelu(self):
             prec=5e-3,
             levels=['O0'])

+    def test_channel_shuffle(self):
+        self._test_output(
+            ChannelShuffle(10, 16, 50, 50, 4),
+            torch.rand(10, 16, 50, 50),
+            kind_in_graph="ipex::shuffle_2d")
+
     def test_jit_function(self):
         # test that trace and script both work for a function
         def fn(input, weight, bias):
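
The ChannelShuffle helper used above is defined elsewhere in the test suite and is not part of this diff. A plausible reconstruction, hypothetical but consistent with the call ChannelShuffle(10, 16, 50, 50, 4) and the (10, 16, 50, 50) input:

import torch
import torch.nn as nn

class ChannelShuffle(nn.Module):
    # Hypothetical reconstruction: emits the view -> transpose -> view
    # subgraph that the ipex::shuffle_2d fusion pass is expected to match.
    def __init__(self, batchsize, num_channels, height, width, groups):
        super(ChannelShuffle, self).__init__()
        self.batchsize = batchsize
        self.num_channels = num_channels
        self.height = height
        self.width = width
        self.groups = groups

    def forward(self, x):
        g = self.groups
        x = x.view(self.batchsize, g, self.num_channels // g,
                   self.height, self.width)
        x = torch.transpose(x, 1, 2).contiguous()
        return x.view(self.batchsize, -1, self.height, self.width)

The kind_in_graph="ipex::shuffle_2d" argument asserts that the fused node actually appears in the traced graph.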

torch_ipex/csrc/cpu/CustomOPs.cpp (39 additions, 30 deletions)
@@ -79,16 +79,16 @@ at::Tensor AtenIpexJITDev::dil_convolution_sigmoid(
 }

 /**
-* Dispatch at::matmul + at::div pattern to ipex for jit inference, but only one-element
-* tensor and channel dim boadcast is enabled in oneDNN 2.2.0 now. So, for simplicity,this path is just
-* a fallback path now.
-* output(out) = (tensor1 * tensor2).div(div_input)
-*
-* @param tensor1
-* @param tensor2
-* @param out Optinal output provided by user for matmul
-* @param div_input Input Tensor for div
-* @return Value for the fusion pattern output.
+* Dispatch at::matmul + at::div pattern to ipex for jit inference, but only
+* one-element tensors and channel-dim broadcast are enabled in oneDNN 2.2.0 now.
+* So, for simplicity, this path is just a fallback path now. output(out) =
+* (tensor1 * tensor2).div(div_input)
+*
+* @param tensor1
+* @param tensor2
+* @param out Optional output provided by the user for matmul
+* @param div_input Input tensor for div
+* @return Value for the fusion pattern output.
 */
 at::Tensor AtenIpexJITDev::dil_matmul_div(
     const at::Tensor& tensor1,
@@ -101,19 +101,18 @@ at::Tensor AtenIpexJITDev::dil_matmul_div(
   if (out.defined()) {
     at::matmul_out(out, tensor1, tensor2);
     return out.div(div_input);
-  }
+  }
   auto output = at::matmul(tensor1, tensor2);
   return output.div(div_input);
-
-
 }

 /**
-*Dispatch at::matmul + at::div pattern to ipex for jit inference, but only bmm with same shape for
-*tensor1 and tensor2 and scalar input for div will be dispatched to oneDNN kernel. Otherwise will fallback.
-*For oneDNN kernel, scalar input will be used as the scale attribute for matmul primitive.
+*Dispatch at::matmul + at::div pattern to ipex for jit inference, but only bmm
+*with the same shape for tensor1 and tensor2 and a scalar input for div will be
+*dispatched to the oneDNN kernel; otherwise it will fall back. For the oneDNN
+*kernel, the scalar input is used as the scale attribute for the matmul primitive.
 *output(out) = (tensor1 * tensor2).div(div_input_scalar).
-*ToDo: matmul + div scalar for matmul with other shape
+*ToDo: matmul + div scalar for matmul with other shapes
 *
 *@param tensor1
 *@param tensor2
@@ -131,8 +130,8 @@ at::Tensor AtenIpexJITDev::dil_matmul_div(
 #endif
   auto dim_tensor1 = tensor1.dim();
   auto dim_tensor2 = tensor2.dim();
-  if (dim_tensor1 == dim_tensor2 && dim_tensor1 >= 3) {
-    float scale = 1.0 / div_input.to<float>();
+  if (dim_tensor1 == dim_tensor2 && dim_tensor1 >= 3) {
+    float scale = 1.0f / div_input.to<float>();
     return bmm_impl(tensor1, tensor2, out, ideep::attr_t(), scale);
   } else {
     return AtenIpexJITDev::dil_matmul_div(tensor1, tensor2, out, at::native::wrapped_scalar_tensor(div_input));
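
For reference, both dil_matmul_div overloads serve the eager-mode pattern sketched below (illustrative, not from this commit). When the inputs have equal rank of at least 3 and the divisor is a scalar, the division folds into the oneDNN matmul as a scale of 1/div; every other case takes the at::matmul followed by div fallback above.

import math
import torch

def matmul_div(tensor1, tensor2, div_input):
    # The JIT pattern both kernels serve: (tensor1 @ tensor2) / div_input.
    return torch.matmul(tensor1, tensor2).div(div_input)

# A typical source of the pattern: scaling attention scores.
q = torch.rand(2, 8, 16, 64)
k = torch.rand(2, 8, 16, 64)
# Scalar divisor and equal-rank inputs: eligible for the scale-attribute path.
scores = matmul_div(q, k.transpose(-2, -1), math.sqrt(64))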
@@ -309,26 +308,24 @@ at::Tensor AtenIpexJITDev::dil_linear_fuse_eltwise(
   return linear_impl(self, weight, bias, attr);
 }

-
 /**
 *Dispatch Linear + Add fusion pattern to ipex oneDNN kernel for inference mode.
 *This feature might improve performance for cases like residual learning blocks
-*Pattern: accum = accum * alpha + Linear(self, weight, bias)
+*Pattern: accum = accum * alpha + Linear(self, weight, bias)
 *
-*@param self Activatin input for Linear
+*@param self Activation input for Linear
 *@param weight Weight for Linear
 *@param bias Bias for Linear
 *@param accum One input of the add operation; the other is the output of Linear
-*@param alpha Scale for accum when doing add operation.
+*@param alpha Scale for accum when doing the add operation.
 *
-*@return Value for the fusion pattern output.
+*@return Value for the fusion pattern output.
 */
-at::Tensor AtenIpexJITDev::dil_linear_add(
-    const at::Tensor& self,
-    const at::Tensor& weight,
-    const at::Tensor& bias,
-    at::Tensor& accumu,
-    at::Scalar alpha) {
+at::Tensor AtenIpexJITDev::dil_linear_add(const at::Tensor &self,
+                                          const at::Tensor &weight,
+                                          const at::Tensor &bias,
+                                          at::Tensor &accumu,
+                                          at::Scalar alpha) {
 #if defined(IPEX_PROFILE_OP)
   RECORD_FUNCTION("AtenIpexJITDev::dil_linear_add", std::vector<c10::IValue>({}));
 #endif
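
The pattern handled here is the residual form named in the comment, accum = accum * alpha + Linear(self, weight, bias). A minimal eager-mode equivalent (illustrative sketch, not from this commit):

import torch
import torch.nn.functional as F

def linear_add(x, weight, bias, accum, alpha=1.0):
    # Residual-style pattern matched by the dil_linear_add fusion; the
    # kernel computes the Linear and the scaled add together instead of
    # materializing the Linear output and adding in a second pass.
    return accum * alpha + F.linear(x, weight, bias)

x = torch.rand(4, 64)
weight = torch.rand(128, 64)  # (out_features, in_features)
bias = torch.rand(128)
accum = torch.rand(4, 128)
out = linear_add(x, weight, bias, accum)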
@@ -468,5 +465,17 @@ at::Tensor AtenIpexJITDev::dil_layernorm(
       at::native_layer_norm(input, normalized_shape, weight, bias, eps));
 }

+at::Tensor AtenIpexJITDev::dil_shuffle(const at::Tensor &self,
+                                       at::IntArrayRef view_shape, int64_t dim0,
+                                       int64_t dim1) {
+  ideep::tensor _self = itensor_view_from_dense(self);
+  auto group_dim = dim0 < dim1 ? dim0 : dim1;
+  auto groups = view_shape[group_dim];
+  auto output = at::empty_like(self);
+  ideep::tensor _output = itensor_view_from_dense(output);
+  ideep::channel_shuffle_forward::compute(_self, _output, groups, group_dim);
+  return output;
+}
+
 } // namespace cpu
 } // namespace torch_ipex
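
A note on how dil_shuffle recovers the oneDNN parameters from the matched subgraph: the transpose swaps the group dimension and the channels-per-group dimension of the 5-D view, so the smaller of the two transposed dims is the group dimension, and the view shape at that dim is the group count. The same mapping in Python (illustrative, mirroring the C++ above):

def shuffle_params(view_shape, dim0, dim1):
    # For a view such as (N, g, C // g, H, W) transposed over dims (1, 2),
    # the group dimension is the smaller transposed dim and the group
    # count is the view size at that dimension.
    group_dim = min(dim0, dim1)
    groups = view_shape[group_dim]
    return groups, group_dim

# e.g. 16 channels shuffled in 4 groups: x.view(10, 4, 4, 50, 50).transpose(1, 2)
assert shuffle_params([10, 4, 4, 50, 50], 1, 2) == (4, 1)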

torch_ipex/csrc/cpu/CustomOPs.h (4 additions, 0 deletions)

@@ -194,6 +194,10 @@ class AtenIpexJITDev {
       bool weight_channels_last, bool weight_prepacked, at::Tensor &accumu,
       at::Scalar alpha);

+  static at::Tensor dil_shuffle(const at::Tensor &self,
+                                at::IntArrayRef view_shape, int64_t dim0,
+                                int64_t dim1);
+
   // int8 op
   static at::Tensor dil_qembeddingbag(const at::Tensor weight,
                                       const at::Tensor indices,
