Skip to content

Commit 97d4606

Browse files
bf16: replace aten::max_pool2d with ipex::max_pool2d for better performance (#7)
1 parent dac3224 commit 97d4606

File tree

7 files changed

+250
-0
lines changed

7 files changed

+250
-0
lines changed

torch_ipex/csrc/cpu/FusionOPs.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "torch_ipex/csrc/utils.h"
33
#include "Conv.h"
44
#include "Linear.h"
5+
#include "Pooling.h"
56

67
#include <ATen/Context.h>
78
#include <ATen/InferSize.h>

torch_ipex/csrc/cpu/Pooling.cpp

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
#include "mkldnn/MKLDNNCommon.h"
2+
#include "torch_ipex/csrc/utils.h"
3+
4+
#include <ATen/native/Pool.h>
5+
6+
namespace torch_ipex {
7+
namespace cpu {
8+
9+
// Broadcast a single-value parameter list to `expected_dim` entries, or
// validate that an explicitly given list already has the expected length.
// Mirrors ATen's convention for kernel_size/stride/padding/dilation args.
inline std::vector<int64_t> expand_param_if_needed(
    at::IntArrayRef list_param,
    const char* param_name,
    int64_t expected_dim) {
  const auto given = static_cast<int64_t>(list_param.size());
  if (given == 1) {
    // A single value applies to every spatial dimension.
    return std::vector<int64_t>(expected_dim, list_param[0]);
  }
  if (given == expected_dim) {
    return list_param.vec();
  }
  std::ostringstream msg;
  msg << "expected " << param_name << " to be a single integer value or a "
      << "list of " << expected_dim << " values to match the convolution "
      << "dimensions, but got " << param_name << "=" << list_param;
  AT_ERROR(msg.str());
}
25+
26+
// Compute the pooled output shape for an N-D input (N, C, spatial...),
// honoring asymmetric left/right padding. Batch and channel dimensions
// pass through unchanged; each spatial dimension is derived via ATen's
// pooling_output_shape_pad_lr helper.
std::vector<int64_t> pool_output_sizes(
    at::IntArrayRef input_size,
    at::IntArrayRef kernel_size,
    at::IntArrayRef stride,
    at::IntArrayRef padding_l,
    at::IntArrayRef padding_r,
    at::IntArrayRef dilation,
    bool ceil_mode) {
  const size_t ndim = input_size.size();
  std::vector<int64_t> out(ndim);
  // N and C are untouched by pooling.
  out[0] = input_size[0];
  out[1] = input_size[1];

  for (size_t dim = 2; dim < ndim; ++dim) {
    const size_t s = dim - 2; // index into the spatial parameter lists
    out[dim] = at::native::pooling_output_shape_pad_lr<int64_t>(
        input_size[dim],
        kernel_size[s],
        padding_l[s],
        padding_r[s],
        stride[s],
        dilation[s],
        ceil_mode);
  }

  return out;
}
53+
54+
// Run a oneDNN (MKL-DNN) pooling forward over `input`.
//
// Fixes vs. previous revision: removed the unused local `ideep::tensor y`
// (dead code) and the redundant copy of the padding vector that the old
// NOLINT(performance-unnecessary-copy-initialization) annotation flagged —
// only the right-side padding is ever mutated, so only it needs a copy.
//
// `stride` may be empty, meaning "stride == kernel_size" (ATen convention).
// `algo` selects max vs. average pooling. Returns a dense ATen tensor; for
// channels-last inputs the result is written directly into an ATen buffer
// to avoid an extra conversion.
static at::Tensor _mkldnn_pooling(
    const at::Tensor& input,
    at::IntArrayRef kernel_size,
    at::IntArrayRef stride,
    at::IntArrayRef padding,
    at::IntArrayRef dilation,
    bool ceil_mode,
    ideep::algorithm algo) {

  const int64_t dims = input.dim() - 2;
  auto kernel_size_vec = expand_param_if_needed(kernel_size, "kernel_size", dims);
  // Empty stride defaults to the kernel size (same convention as ATen).
  if (stride.empty()) stride = kernel_size;
  auto stride_vec = expand_param_if_needed(stride, "stride", dims);
  // Left padding stays as given; right padding may be grown below to
  // emulate ceil_mode, which oneDNN does not support natively.
  auto padding_vec_l = expand_param_if_needed(padding, "padding", dims);
  auto padding_vec_r = padding_vec_l;
  auto dilation_vec = expand_param_if_needed(dilation, "dilation", dims);

  // TODO: the input will be actively converted to channels last format
  // after the 5-D tensor supports channels last format.
  const ideep::tensor mkldnn_input = at::native::itensor_view_from_dense(input);
  std::vector<int64_t> output_sizes;

  if (ceil_mode) {
    // oneDNN has no ceil mode: compute the target (ceil) output shape,
    // then grow the right-side padding until floor-mode pooling produces
    // the same shape.
    const std::vector<int64_t> output_sizes_ceil = pool_output_sizes(
        input.sizes(),
        kernel_size_vec,
        stride_vec,
        padding_vec_l,
        padding_vec_r,
        dilation_vec,
        true /* ceil_mode */);

    // adjust padding until output sizes agree
    bool all_equal = false;
    while (!all_equal) {
      output_sizes = pool_output_sizes(
          input.sizes(),
          kernel_size_vec,
          stride_vec,
          padding_vec_l,
          padding_vec_r,
          dilation_vec,
          false /*ceil_mode */);

      all_equal = true;
      for (size_t i = 2; i < input.sizes().size(); ++i) {
        if (output_sizes[i] < output_sizes_ceil[i]) {
          padding_vec_r[i - 2]++;
          all_equal = false;
        }
      }
    }
  } else {
    output_sizes = pool_output_sizes(
        input.sizes(),
        kernel_size_vec,
        stride_vec,
        padding_vec_l,
        padding_vec_r,
        dilation_vec,
        false /*ceil_mode */);
  }

  const bool is_channels_last =
      input.suggest_memory_format() == at::MemoryFormat::ChannelsLast;
  auto output = at::empty({0}, input.options());
  ideep::tensor mkldnn_output;
  if (is_channels_last) {
    // Let oneDNN write straight into a channels-last ATen tensor so no
    // mkldnn->dense conversion is needed afterwards.
    output.resize_(output_sizes, input.suggest_memory_format());
    mkldnn_output = at::native::itensor_view_from_dense(output);
  }

  auto aprop_kind = ideep::prop_kind::forward;
  // for max_pool, prop_kind::forward will save indices as workspace for backward use,
  // for inference, don't need the indices, set aprop_kind to prop_kind::forward_inference
  // can reduce the memory use.
  if (ideep::algorithm::pooling_max == algo
      && !(input.requires_grad() && at::GradMode::is_enabled())) {
    aprop_kind = ideep::prop_kind::forward_inference;
  }

  ideep::pooling_forward::compute(
      mkldnn_input,
      {output_sizes.cbegin(), output_sizes.cend()},
      mkldnn_output,
      {stride_vec.cbegin(), stride_vec.cend()},
      {kernel_size_vec.cbegin(), kernel_size_vec.cend()},
      {padding_vec_l.cbegin(), padding_vec_l.cend()},
      {padding_vec_r.cbegin(), padding_vec_r.cend()},
      algo,
      aprop_kind);

  if (is_channels_last) {
    return output;
  } else {
    // oneDNN allocated `mkldnn_output` itself; wrap it and convert back
    // to a dense (plain-layout) ATen tensor.
    return at::native::mkldnn_to_dense(
        at::native::new_with_itensor_mkldnn(std::move(mkldnn_output),
            optTypeMetaToScalarType(input.options().dtype_opt()),
            input.options().device_opt()));
  }
}
159+
160+
// JIT entry point for ipex::max_pool2d: runs 2D max pooling through the
// oneDNN pooling primitive. Rejects any dilation other than 1 (oneDNN
// limitation) and makes the input contiguous if it is not already in a
// contiguous (any) memory format.
at::Tensor dil_max_pool2d(
    const at::Tensor& input,
    at::IntArrayRef kernel_size,
    at::IntArrayRef stride,
    at::IntArrayRef padding,
    at::IntArrayRef dilation,
    bool ceil_mode) {
#if defined(IPEX_PROFILE_OP)
  RECORD_FUNCTION("AtenIpexJITDev::dil_max_pool2d", std::vector<c10::IValue>({}));
#endif
  const bool dilation_is_trivial = std::all_of(
      dilation.cbegin(), dilation.cend(), [](int64_t d) { return 1 == d; });
  TORCH_CHECK(dilation_is_trivial,
      "mkldnn_max_pool2d does not support dilation case");
  at::Tensor src = IS_CONTIGUOUS_ANY(input) ? input : input.contiguous();
  return _mkldnn_pooling(
      src,
      kernel_size,
      stride,
      padding,
      dilation,
      ceil_mode,
      ideep::algorithm::pooling_max);
}
181+
182+
} // namespace cpu
183+
} // namespace torch_ipex

torch_ipex/csrc/cpu/Pooling.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#pragma once

#include <ATen/Tensor.h>

#include "ideep/ideep.hpp"

#include <vector>

namespace torch { namespace jit {

// JIT symbol for the custom ipex::max_pool2d operator; referenced by the
// graph-rewrite pass and the operator registration.
namespace ipex {
// NOTE(review): `static` in a header gives each translation unit its own
// copy of this Symbol — presumably intentional (TorchScript headers use the
// same pattern), but an `inline` variable would avoid the duplication;
// confirm before changing.
static auto max_pool2d = Symbol::fromQualString("ipex::max_pool2d");
}

}} // namespace torch::jit

namespace torch_ipex {
namespace cpu {

// 2D max pooling backed by oneDNN (MKL-DNN).
// `stride` may be empty, in which case it defaults to `kernel_size`;
// dilation values other than 1 are rejected by the implementation.
at::Tensor dil_max_pool2d(
    const at::Tensor& input,
    at::IntArrayRef kernel_size,
    at::IntArrayRef stride,
    at::IntArrayRef padding,
    at::IntArrayRef dilation,
    bool ceil_mode);

} // namespace cpu
} // namespace torch_ipex

torch_ipex/csrc/jit/fusion_pass.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "graph_rewrite.h"
44

55
#include "cpu/FusionOPs.h"
6+
#include "cpu/Pooling.h"
67

78
#include <c10/util/hash.h>
89
#include <torch/csrc/jit/runtime/operator.h>
@@ -319,6 +320,9 @@ void FusionPass(std::shared_ptr<Graph> &graph) {
319320

320321
// replace aten conv with ipex conv
321322
graph_rewrite::replaceAtenConvolutionWithIpexConv(graph);
323+
324+
// replace aten max_pool2d with ipex max_pool2d
325+
graph_rewrite::replaceAtenMaxPool2dWithIpexMaxPool2d(graph);
322326
// TODO: Some post processing?? ECS/EDC/Peephole???
323327
ConstantPropagation(graph);
324328
}

torch_ipex/csrc/jit/graph_rewrite.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,20 @@ void replaceAtenConvolutionWithIpexConv(std::shared_ptr<Graph>& graph) {
382382
rewriter_conv2d.runOnGraph(graph);
383383
}
384384

385+
void replaceAtenMaxPool2dWithIpexMaxPool2d(std::shared_ptr<Graph>& graph) {
386+
std::string max_pool2d = R"(
387+
graph(%a, %kernel_size:int[], %stride:int[], %padding:int[], %dilation:int[], %ceil_mode:bool):
388+
%r = aten::max_pool2d(%a, %kernel_size, %stride, %padding, %dilation, %ceil_mode)
389+
return (%r) )";
390+
std::string ipex_max_pool2d = R"(
391+
graph(%a, %kernel_size:int[], %stride:int[], %padding:int[], %dilation:int[], %ceil_mode:bool):
392+
%r = ipex::max_pool2d(%a, %kernel_size, %stride, %padding, %dilation, %ceil_mode)
393+
return (%r) )";
394+
SubgraphRewriter rewriter_max_pool2d;
395+
rewriter_max_pool2d.RegisterRewritePattern(max_pool2d, ipex_max_pool2d);
396+
rewriter_max_pool2d.runOnGraph(graph);
397+
}
398+
385399
} // namespace graph_rewrite
386400
} // namespace jit
387401
} // namespace torch

torch_ipex/csrc/jit/graph_rewrite.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ void replaceConvolutionWithAtenConv(std::shared_ptr<Graph>& graph);
2323
void replaceAtenConvolutionWithIpexConv(std::shared_ptr<Graph>& graph);
2424
void FuseConvolutionWithEltwise(std::shared_ptr<Graph>& graph);
2525
void FuseShuffle(std::shared_ptr<Graph>& graph);
26+
void replaceAtenMaxPool2dWithIpexMaxPool2d(std::shared_ptr<Graph>& graph);
2627

2728
} // namespace graph_rewrite_helper
2829
} // namespace jit

torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include "torch_ipex/csrc/cpu/FusionOPs.h"
77
#include "torch_ipex/csrc/utils.h"
8+
#include "torch_ipex/csrc/cpu/Pooling.h"
89

910
namespace torch {
1011
namespace jit {
@@ -146,6 +147,23 @@ RegisterOperators op({
146147
};
147148
},
148149
aliasAnalysisFromSchema()),
150+
Operator(
151+
"ipex::max_pool2d(Tensor input, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode) -> Tensor",
152+
[](const Node* node) -> Operation {
153+
return [](Stack* stack) {
154+
auto result = torch_ipex::cpu::dil_max_pool2d(
155+
(std::move(peek(stack, 0, 6))).toTensor(),
156+
(std::move(peek(stack, 1, 6))).toIntVector(),
157+
(std::move(peek(stack, 2, 6))).toIntVector(),
158+
(std::move(peek(stack, 3, 6))).toIntVector(),
159+
(std::move(peek(stack, 4, 6))).toIntVector(),
160+
(std::move(peek(stack, 5, 6))).toBool());
161+
drop(stack, 6);
162+
pack(stack, std::move(result));
163+
return 0;
164+
};
165+
},
166+
aliasAnalysisFromSchema()),
149167
});
150168
} // namespace jit
151169
} // namespace torch

0 commit comments

Comments
 (0)