From 89541bcbe768220403a9671476fa199d4dc6e50d Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Tue, 19 May 2020 12:41:57 +0800
Subject: [PATCH 01/10] jit: enable conv_relu fusion

---
 cmake/CPU.cmake                               |   4 +-
 tests/cpu/test_jit.py                         | 111 ++++++++++++++++++
 torch_ipex/csrc/auto_opt_config.h             |   9 ++
 torch_ipex/csrc/cpu/DevOPs.cpp                |  35 ++++++
 torch_ipex/csrc/cpu/DevOPs.h                  |   4 +
 torch_ipex/csrc/cpu/dbl/Conv.cpp              |  15 ++-
 torch_ipex/csrc/cpu/dbl/Conv.h                |   3 +-
 torch_ipex/csrc/init_python_bindings.cpp      |  39 +++++-
 torch_ipex/csrc/jit/CMakeLists.txt            |  10 ++
 torch_ipex/csrc/jit/accelerated_ops.h         |   2 +-
 torch_ipex/csrc/jit/dnnl_ops.h                |   2 +-
 torch_ipex/csrc/jit/fusion_pass.cpp           |  18 +--
 torch_ipex/csrc/jit/graph_ext.cpp             |  14 ++-
 torch_ipex/csrc/jit/graph_ext.h               |  22 ++--
 torch_ipex/csrc/jit/init.cpp                  |  21 ++--
 torch_ipex/csrc/jit/op_rewrite.cpp            |  17 ++-
 torch_ipex/csrc/jit/op_rewrite.h              |   2 +-
 torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp |  52 +++++---
 18 files changed, 321 insertions(+), 59 deletions(-)
 create mode 100644 tests/cpu/test_jit.py
 create mode 100644 torch_ipex/csrc/jit/CMakeLists.txt

diff --git a/cmake/CPU.cmake b/cmake/CPU.cmake
index 72693d419..5d57ccfb9 100644
--- a/cmake/CPU.cmake
+++ b/cmake/CPU.cmake
@@ -136,9 +136,11 @@ include_directories(${DPCPP_THIRD_PARTY_ROOT}/xsmm/include)
 set(DPCPP_SRCS)
 set(DPCPP_COMMON_SRCS)
 set(DPCPP_CPU_SRCS)
+set(DPCPP_JIT_SRCS)
 
 add_subdirectory(${DPCPP_ROOT})
 add_subdirectory(${DPCPP_ROOT}/cpu)
+add_subdirectory(${DPCPP_ROOT}/jit)
 
 # libxsmm
 include(${CMAKE_ROOT}/Modules/ExternalProject.cmake)
@@ -153,7 +155,7 @@ ExternalProject_Add(xsmm
   INSTALL_COMMAND ""
   )
 # Compile code with pybind11
-set(DPCPP_SRCS ${DPCPP_ATEN_SRCS} ${DPCPP_COMMON_SRCS} ${DPCPP_CPU_SRCS})
+set(DPCPP_SRCS ${DPCPP_ATEN_SRCS} ${DPCPP_COMMON_SRCS} ${DPCPP_CPU_SRCS} ${DPCPP_JIT_SRCS})
 pybind11_add_module(${PLUGIN_NAME} SHARED ${DPCPP_SRCS})
 target_link_libraries(${PLUGIN_NAME} PRIVATE ${DPCPP_THIRD_PARTY_ROOT}/xsmm/lib/libxsmm.a)
 
diff --git a/tests/cpu/test_jit.py b/tests/cpu/test_jit.py
new file mode 100644
index 000000000..42142cdc8
--- /dev/null
+++ b/tests/cpu/test_jit.py
@@ -0,0 +1,111 @@
+from __future__ import division
+from __future__ import print_function
+
+'''
+From PyTorch:
+
+Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
+Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU                      (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+
+From Caffe2:
+
+Copyright (c) 2016-present, Facebook Inc. All rights reserved.
+
+All contributions by Facebook:
+Copyright (c) 2016 Facebook Inc.
+
+All contributions by Google:
+Copyright (c) 2015 Google Inc.
+All rights reserved.
+
+All contributions by Yangqing Jia:
+Copyright (c) 2015 Yangqing Jia
+All rights reserved.
+
+All contributions from Caffe:
+Copyright(c) 2013, 2014, 2015, the respective contributors
+All rights reserved.
+
+All other contributions:
+Copyright(c) 2015, 2016 the respective contributors
+All rights reserved.
+
+Caffe2 uses a copyright model similar to Caffe: each contributor holds
+copyright over their contributions to Caffe2. The project versioning records
+all such contribution and copyright details. If a contributor wants to further
+mark their specific copyright on a particular contribution, they should
+indicate their copyright solely in the commit message of the change when it is
+committed.
+
+All rights reserved.
+'''
+
+"""Tests for JIT op fusion (conv2d + relu) on the dpcpp device."""
+
+import math
+import random
+import unittest
+from functools import reduce
+
+import torch
+import torch.nn as nn
+import copy
+
+import intel_pytorch_extension
+from intel_pytorch_extension import core
+
+import torch.nn as nn
+import torch.backends.cudnn as cudnn
+from torch.nn import Parameter
+import torch.nn.functional as F
+from torch.autograd import gradcheck
+from torch.autograd.gradcheck import gradgradcheck
+from torch._six import inf, nan
+
+from common_utils import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \
+    TEST_LIBROSA, run_tests, download_file, skipIfNoLapack, suppress_warnings, \
+    IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, do_test_dtypes, do_test_empty_full, \
+    IS_SANDCASTLE, load_tests, brute_pdist, brute_cdist, slowTest, \
+    skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf
+
+device = 'dpcpp:0'
+#device = 'cpu:0'
+SIZE = 100
+
+torch._C._jit_set_profiling_mode(False)
+torch._C._jit_set_profiling_executor(False)
+
+class Conv_relu(nn.Module):
+    """Conv2d(20, 20, 5) followed by ReLU, built under a fixed RNG seed."""
+    def __init__(self):
+        super(Conv_relu, self).__init__()
+        torch.manual_seed(2018)
+        self.conv = torch.nn.Conv2d(20, 20, 5)
+
+    def forward(self, x):
+        return self.conv(x).relu()
+
+class TestJITOP(TestCase):
+    def test_conv_relu_fusion(self):
+        """Scripted conv+relu with JIT fusion on must match the eager result."""
+        x = torch.randn(1, 20, 20, 20).to('dpcpp')
+        model = Conv_relu().to('dpcpp').eval()
+        # Restore the global fusion switch afterwards so tests stay independent.
+        self.addCleanup(core.enable_jit if core.get_jit() else core.disable_jit)
+        with torch.no_grad():
+            core.disable_jit()
+            y1 = model(x)
+            core.enable_jit()
+            y2 = torch.jit.script(model)(x)
+        self.assertEqual(y1, y2)
+
+if __name__ == '__main__':
+    core.enable_auto_dnnl()  # switched on once, before any test runs
+    unittest.main()  # exits the interpreter when the run finishes
diff --git a/torch_ipex/csrc/auto_opt_config.h b/torch_ipex/csrc/auto_opt_config.h
index 333a0adfd..2f950edef 100644
--- a/torch_ipex/csrc/auto_opt_config.h
+++ b/torch_ipex/csrc/auto_opt_config.h
@@ -17,6 +17,14 @@ class AutoOptConfig {
     return auto_dnnl_;
   }
 
+  inline void set_jit_fuse(bool jit_fuse) {
+    jit_fuse_ = jit_fuse;  // switch consulted by the registered JIT graph passes
+  }
+  // Returns the value last stored by set_jit_fuse().
+  inline bool get_jit_fuse() {
+    return jit_fuse_;
+  }
+
   inline void set_mix_bf16_fp32(bool value) {
     mix_bf16_fp32_ = value;
   }
@@ -39,6 +47,7 @@ class AutoOptConfig {
 
 private:
   bool auto_dnnl_;
+  bool jit_fuse_ = false;  // JIT fusion is opt-in; explicit default so it is never read uninitialized
   bool mix_bf16_fp32_;
   bool pure_bf16_;
 };
diff --git a/torch_ipex/csrc/cpu/DevOPs.cpp b/torch_ipex/csrc/cpu/DevOPs.cpp
index ebe231f42..a162d7863 100644
--- a/torch_ipex/csrc/cpu/DevOPs.cpp
+++ b/torch_ipex/csrc/cpu/DevOPs.cpp
@@ -67,6 +67,41 @@ at::Tensor AtenIpexCPUDev::dil_convolution(
   return dbl::comm::gen_aten_tensor_by(std::move(dil_output));
 }
 
+// JIT-only variant of dil_convolution (backs the dnnl::conv2d_relu operator):
+// calls dbl::conv::conv2d_impl with the dil::attr_t::fuse_relu() attribute so
+// the ReLU is requested as part of the convolution call. `bias` may be an
+// undefined tensor; it is then forwarded to conv2d_impl as c10::nullopt.
+at::Tensor AtenIpexCPUDev::dil_convolution_relu(
+    const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias,
+    at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation,
+    int64_t groups) {
+  DEBUG("AtenIpexCPUDev::dil_convolution_relu\n");
+  dil::tensor dil_input;
+  dil::tensor dil_weight;
+  c10::optional<dil::tensor> dil_bias{c10::nullopt};
+
+  CHECK_DNNL_OP_PRE_COND(input);
+  CHECK_DNNL_OP_PRE_COND(weight);
+  dil_input = dbl::comm::try_gen_dil_tensor(input);
+  dil_weight = dbl::comm::try_gen_dil_tensor(weight);
+  if (bias.defined()) {
+    CHECK_DNNL_OP_PRE_COND(bias);
+    dil_bias = dbl::comm::try_gen_dil_tensor(bias);
+  }
+  // Note the argument order: conv2d_impl takes padding before stride.
+  dil::tensor dil_output = dbl::conv::conv2d_impl(
+    dil_input,
+    dil_weight,
+    dil_bias,
+    padding,
+    stride,
+    dilation,
+    groups,
+    dil::attr_t::fuse_relu());
+  // dil_output is not used again, so hand it over by move as dil_convolution does.
+  return dbl::comm::gen_aten_tensor_by(std::move(dil_output));
+}
+
 at::Tensor dil_convolution_backward_input(
     at::IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight,
     at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool bias_defined)
diff --git a/torch_ipex/csrc/cpu/DevOPs.h b/torch_ipex/csrc/cpu/DevOPs.h
index 7c76873e6..856bee0a7 100644
--- a/torch_ipex/csrc/cpu/DevOPs.h
+++ b/torch_ipex/csrc/cpu/DevOPs.h
@@ -69,6 +69,10 @@ class AtenIpexCPUDev {
   static at::Tensor dil_cat(at::TensorList tensors, int64_t dim);
   static std::vector<at::Tensor> dil_split_with_sizes(const at::Tensor& self, at::IntArrayRef split_sizes, int64_t dim);
   static std::vector<at::Tensor> dil_split(const at::Tensor& self, int64_t split_size, int64_t dim);
+
+  // for JIT ops
+  static at::Tensor dil_convolution_relu(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups);
+
 };
 
 }  // namespace cpu
diff --git a/torch_ipex/csrc/cpu/dbl/Conv.cpp b/torch_ipex/csrc/cpu/dbl/Conv.cpp
index b8576e669..9dadfdebd 100644
--- a/torch_ipex/csrc/cpu/dbl/Conv.cpp
+++ b/torch_ipex/csrc/cpu/dbl/Conv.cpp
@@ -31,7 +31,8 @@ dil::tensor conv2d_impl(
     at::IntArrayRef padding,
     at::IntArrayRef stride,
     at::IntArrayRef dilation,
-    int64_t groups) {
+    int64_t groups,
+    const dil::attr_t& attr) {
   std::vector<int64_t> kernel_size(x.ndims());
   // mkldnn conv2d weights could have been re-ordered to 5d by
   // mkldnn_reorder_conv2d_weight
@@ -61,7 +62,11 @@ dil::tensor conv2d_impl(
       {dilation.begin(), dilation.end()},
       {padding.begin(), padding.end()},
       {padding.begin(), padding.end()},
-      groups);
+      groups,
+      dil::scale_t(),
+      dil::scale_t(),
+      dil::scale_t(),
+      attr);
   } else {
     dil::convolution_forward::compute(
       x,
@@ -72,7 +77,11 @@ dil::tensor conv2d_impl(
       {dilation.begin(), dilation.end()},
       {padding.begin(), padding.end()},
       {padding.begin(), padding.end()},
-      groups);
+      groups,
+      dil::scale_t(),
+      dil::scale_t(),
+      dil::scale_t(),
+      attr);
   }
   return y;
 }
diff --git a/torch_ipex/csrc/cpu/dbl/Conv.h b/torch_ipex/csrc/cpu/dbl/Conv.h
index 224551ca4..e4d41aa33 100644
--- a/torch_ipex/csrc/cpu/dbl/Conv.h
+++ b/torch_ipex/csrc/cpu/dbl/Conv.h
@@ -25,7 +25,8 @@ dil::tensor conv2d_impl(
     at::IntArrayRef padding,
     at::IntArrayRef stride,
     at::IntArrayRef dilation,
-    int64_t groups);
+    int64_t groups,
+    const dil::attr_t& attr = dil::attr_t());
 
 }  // namespace conv
 }  // namespace dbl
diff --git a/torch_ipex/csrc/init_python_bindings.cpp b/torch_ipex/csrc/init_python_bindings.cpp
index b50eca837..4ffe64246 100644
--- a/torch_ipex/csrc/init_python_bindings.cpp
+++ b/torch_ipex/csrc/init_python_bindings.cpp
@@ -5,6 +5,13 @@
 #include <c10/util/Optional.h>
 #include <torch/csrc/utils/pybind.h>
 
+#include <torch/csrc/jit/python/pybind_utils.h>
+#include <torch/csrc/jit/runtime/custom_operator.h>
+#include <torch/csrc/jit/runtime/operator_options.h>
+#include <torch/csrc/jit/passes/pass_manager.h>
+#include "jit/fusion_pass.h"
+#include "jit/op_rewrite.h"
+
 #include <cstring>
 #include <sstream>
 #include <string>
@@ -128,15 +135,41 @@ void InitIpexModuleBindings(py::module m) {
   m.def("mlp_create_handle", &AtenIpexTypeMLPExt::create_handle);
   m.def("mlp_set_relu_mask", &AtenIpexTypeMLPExt::set_relu_mask);
   m.def("mlp_release_handle", &AtenIpexTypeMLPExt::release_handle);
-
   m.def("is_dil_tensor", &isDilTensor);
   m.def("get_dil_tensor_sizes", &getDilTensorSizes);
   m.def("get_dil_tensor_strides", &getDilTensorStrides);
+  m.def("enable_jit", []() { AutoOptConfig::singleton().set_jit_fuse(true); });
+  m.def("disable_jit", []() { AutoOptConfig::singleton().set_jit_fuse(false); });
+  m.def("get_jit", []() { return AutoOptConfig::singleton().get_jit_fuse(); });
 }
 
 }  // namespace
-
-void InitIpexBindings(py::module m) { InitIpexModuleBindings(m); }
+using namespace torch::jit;
+
+void InitIpexBindings(py::module m) {
+  InitIpexModuleBindings(m);
+  // JIT path: register custom graph passes. Each pass checks the jit_fuse flag
+  // when it runs, so enable_jit()/disable_jit() take effect without re-registering.
+  RegisterPass pass_1([](std::shared_ptr<Graph>& g) {
+    if (AutoOptConfig::singleton().get_jit_fuse()) {
+      torch::jit::OpRewritePass(g);
+    }
+  });
+  /*
+  RegisterPass pass_2([](std::shared_ptr<Graph>& g) {
+    if (AutoOptConfig::singleton().get_jit_fuse()) {
+      std::cout<<"uisng pass2"<<std::endl;
+      torch::jit::FormatOptimize(g);
+    }
+  });
+  */
+  // JIT fusion pass (FusionPass from jit/fusion_pass.h), gated on the same flag.
+  RegisterPass pass3([](std::shared_ptr<Graph>& g) {
+    if (AutoOptConfig::singleton().get_jit_fuse()) {
+      torch::jit::FusionPass(g);
+    }
+  });
+}
 
 }  // namespace torch_ipex
 
diff --git a/torch_ipex/csrc/jit/CMakeLists.txt b/torch_ipex/csrc/jit/CMakeLists.txt
new file mode 100644
index 000000000..58f3e2729
--- /dev/null
+++ b/torch_ipex/csrc/jit/CMakeLists.txt
@@ -0,0 +1,10 @@
+LIST(APPEND DPCPP_JIT_SRCS
+    ${DPCPP_ROOT}/jit/fusion_pass.cpp
+    ${DPCPP_ROOT}/jit/graph_ext.cpp
+    ${DPCPP_ROOT}/jit/op_rewrite.cpp
+    ${DPCPP_ROOT}/jit/register_dnnl_jit_ops.cpp
+
+)
+
+# Export the JIT source list to the parent scope so the including CMake file can add it to its sources.
+set(DPCPP_JIT_SRCS ${DPCPP_JIT_SRCS} PARENT_SCOPE)
diff --git a/torch_ipex/csrc/jit/accelerated_ops.h b/torch_ipex/csrc/jit/accelerated_ops.h
index 3d4b6944b..9183334f9 100644
--- a/torch_ipex/csrc/jit/accelerated_ops.h
+++ b/torch_ipex/csrc/jit/accelerated_ops.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <ideep.hpp>
+#include "cpu/dil/dil.hpp"
 #include <torch/csrc/jit/runtime/custom_operator.h>
 
 namespace torch { namespace jit {
diff --git a/torch_ipex/csrc/jit/dnnl_ops.h b/torch_ipex/csrc/jit/dnnl_ops.h
index 547c03675..abb7de3de 100644
--- a/torch_ipex/csrc/jit/dnnl_ops.h
+++ b/torch_ipex/csrc/jit/dnnl_ops.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <ideep.hpp>
+#include "cpu/dil/dil.hpp"
 #include <ATen/ATen.h>
 #include <ATen/NativeFunctions.h>
 
diff --git a/torch_ipex/csrc/jit/fusion_pass.cpp b/torch_ipex/csrc/jit/fusion_pass.cpp
index 62ca9c86c..ef528e432 100644
--- a/torch_ipex/csrc/jit/fusion_pass.cpp
+++ b/torch_ipex/csrc/jit/fusion_pass.cpp
@@ -88,19 +88,20 @@ class OpFuser {
   // currently we only have to fold conv2d + batch_norm
   //
   bool isFoldable(Node* node, Node* prev) {
-    bool foldable = (node->kind() == dnnl::batch_norm
-        && prev->kind() == dnnl::conv2d);
-
+    bool foldable = (node->kind() == aten::batch_norm
+        && prev->kind() == aten::conv2d);
     //
     // Check whether all the sources are constant ???
     // Does performance improve no matter we do it pre-compiling or runtime?
     //
+
     auto* conv2d = reinterpret_cast<NodeExt *>(prev)->cast<Conv2dNode>();
     auto* batch_norm = reinterpret_cast<NodeExt *>(node)->cast<BatchNorm2dNode>();
 
     foldable = foldable
       && conv2d->hasConstantParams()
       && batch_norm->hasConstantParams();
+
     return foldable;
   }
 
@@ -125,6 +126,7 @@ class OpFuser {
     newNode->setScope(conv2d->scope());
 
     // We need following parameters
+
     newNode->addInput(conv2d->input(1));  // Conv2d weights
     newNode->addInput(batch_norm->input(1)); // Batch norm weights
     newNode->addInput(batch_norm->input(4)); // running_var (delta)
@@ -134,7 +136,6 @@ class OpFuser {
     newNode->output()->copyMetadata(conv2d->input(1));
     newNode->output()->setType(conv2d->input(1)->type());
     newNode->output()->setDebugName(conv2d->input(1)->debugName() + ".bn_folded");
-
     return newNode;
   }
 
@@ -198,7 +199,8 @@ class OpFuser {
     }
 
     // throw
-    auto er = script::ErrorReport(node->sourceRange());
+    //auto er = script::ErrorReport(node->sourceRange());
+    auto er = ErrorReport(node->sourceRange());
     er << "Schema not found for fusion process. \n";
     er << "Prev: " << *prev << "\n";
     er << "Node: " << *node << "\n";
@@ -323,7 +325,7 @@ class OpFuser {
     }
 
     return std::make_pair(++pos->iterator(), changed);
-}
+  }
 };
 
 // TODO: These rules should be more scalable
@@ -334,12 +336,14 @@ OpFuser::RuleTab OpFuser::dnnlRules = {
   {{dnnl::batch_norm, dnnl::relu}, dnnl::batch_norm_relu},
   {{dnnl::batch_norm, dnnl::relu_}, dnnl::batch_norm_relu},
   */
+  /*
   {{dnnl::conv2d_sum, dnnl::relu}, dnnl::conv2d_sum_relu},
   {{dnnl::conv2d_sum, dnnl::relu_}, dnnl::conv2d_sum_relu},
 
   {{dnnl::conv2d, dnnl::sum}, dnnl::conv2d_sum},
   {{dnnl::conv2d, dnnl::sum_}, dnnl::conv2d_sum},
-  // {{dnnl::conv2d_relu, dnnl::sum}, dnnl::conv2d_relu_sum}
+  {{dnnl::conv2d_relu, dnnl::sum}, dnnl::conv2d_relu_sum}
+  */
 };
 
 void FusionPass(std::shared_ptr<Graph> &graph) {
diff --git a/torch_ipex/csrc/jit/graph_ext.cpp b/torch_ipex/csrc/jit/graph_ext.cpp
index 46b6ef6bf..658aa4011 100644
--- a/torch_ipex/csrc/jit/graph_ext.cpp
+++ b/torch_ipex/csrc/jit/graph_ext.cpp
@@ -2,6 +2,7 @@
 #include "accelerated_ops.h"
 
 namespace torch { namespace jit {
+/*
 void NodeExt::initFormatInfo() {
   std::vector<int64_t> formatInfo (
       this->inputs().size() + this->outputs().size(),
@@ -9,7 +10,7 @@ void NodeExt::initFormatInfo() {
 
   this->is_(attr::format_info, std::move(formatInfo));
 }
-
+*/
 const std::vector<int64_t>& NodeExt::getFormatInfo() const {
   return this->is(attr::format_info);
 }
@@ -40,6 +41,7 @@ void NodeExt::setGroupInfo(int64_t groups) {
   this->i_(attr::group_info, groups);
 }
 
+/*
 Node *NodeExt::createReorder(Value *v, Graph *g, formatTag from, formatTag to) {
   NodeExt *reorder = nullptr;
   if (from != to) {
@@ -106,7 +108,7 @@ Node* NodeExt::appendReorder(formatTag to, int i) {
 
   return reorder;
 }
-
+*/
 void NodeExt::propagateFormats() {
   // TODO: Need consultant with acceleration libraries
   setOutputFormat(inputFormat());
@@ -185,7 +187,7 @@ bool Conv2dNode::hasConstantParams() const {
 
   return has;
 }
-
+/*
 formatTag Conv2dNode::expectedWeightFormat(
     c10::ArrayRef<int64_t> sizes,
     c10::List<int64_t> stride,
@@ -207,7 +209,6 @@ formatTag Conv2dNode::expectedWeightFormat(
   return desc.get_internal_format();
 }
 
-
 void Conv2dNode::fixWeightFormatIfPossible() {
   if (couldInferFormats()) {
     auto tensor = toIValue(this->input(1))->toTensor();
@@ -222,8 +223,9 @@ void Conv2dNode::fixWeightFormatIfPossible() {
     this->prependReorders(use_list {{this, 1}}, {natureWeightFormat}, {groups});
   }
 }
-
+*/
 bool BatchNorm2dNode::hasConstantParams() const {
+  /*
   bool has =
     this->input(1)->node()->kind() == prim::Constant
       && this->input(2)->node()->kind() == prim::Constant
@@ -234,6 +236,8 @@ bool BatchNorm2dNode::hasConstantParams() const {
   // TODO: more check to make sure
 
   return has;
+  */
+  return true;
 }
 
 }} // namespace torch::jit
diff --git a/torch_ipex/csrc/jit/graph_ext.h b/torch_ipex/csrc/jit/graph_ext.h
index 34a141854..74f762a37 100644
--- a/torch_ipex/csrc/jit/graph_ext.h
+++ b/torch_ipex/csrc/jit/graph_ext.h
@@ -2,21 +2,23 @@
 
 #include <vector>
 #include <memory>
-#include <ideep.hpp>
+
+#include "cpu/dil/dil.hpp"
 #include "accelerated_ops.h"
+
 #include <c10/util/Optional.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/ir/constants.h>
 
 namespace torch { namespace jit {
-using namespace ideep;
-using dataType = ideep::tensor::data_type;
-using formatTag = ideep::format;
+using namespace dil;
+using dataType = dil::tensor::data_type;
+using formatTag = dil::format_tag;
 using formatList = std::vector<formatTag>;
 using groupsList = std::vector<int64_t>;
 
-static constexpr auto natureFormat = formatTag::nchw;
-static constexpr auto natureWeightFormat = formatTag::oihw;
+//static constexpr auto natureFormat = formatTag::nchw;
+//static constexpr auto natureWeightFormat = formatTag::oihw;
 
 // attributes for pyrys ops to decide which format is on
 // Or what formats transfered by reorder
@@ -71,7 +73,7 @@ class NodeExt : public Node {
     return this->kind() == dnnl::batch_norm;
   }
 
-  void initFormatInfo();
+  //void initFormatInfo();
 
   template <class T> T* cast() {
     return reinterpret_cast<T*>(this);
@@ -79,18 +81,19 @@ class NodeExt : public Node {
 private:
   // we save formats as Ints attribute internally
   const std::vector<int64_t>& getFormatInfo() const;
-
+  /*
   static Node* createReorder(
       Value *v, Graph *g, formatTag from, formatTag to);
   static Node* insertReorder(
       Value *v, Node *insert_point, formatTag from, formatTag to);
+  */
 };
 
 class Conv2dNode : public NodeExt {
 public:
   bool couldInferFormats() const;
   bool hasConstantParams() const;
-  void fixWeightFormatIfPossible();
+  //void fixWeightFormatIfPossible();
   formatTag expectedWeightFormat(
       c10::ArrayRef<int64_t> sizes,
       c10::List<int64_t> stride,
@@ -99,6 +102,7 @@ class Conv2dNode : public NodeExt {
       int64_t groups, dataType dtype = dataType::f32) const;
 };
 
+
 class BatchNorm2dNode : public NodeExt {
 public:
   bool hasConstantParams() const;
diff --git a/torch_ipex/csrc/jit/init.cpp b/torch_ipex/csrc/jit/init.cpp
index 487e7019e..3316d4b4f 100644
--- a/torch_ipex/csrc/jit/init.cpp
+++ b/torch_ipex/csrc/jit/init.cpp
@@ -7,19 +7,22 @@
 #include <torch/csrc/jit/passes/pass_manager.h>
 
 #include "accelerated_ops.h"
-#include "op_rewrite.h"
-#include "format_analysis.h"
+//#include "op_rewrite.h"
+//#include "format_analysis.h"
 #include "fusion_pass.h"
-#include "dnnl_ops.h"
+//#include "dnnl_ops.h"
 
 namespace py = pybind11;
 using namespace torch::jit;
 
-static bool pyrys_enabled = false;
+//static bool jit_enabled = false;
+
+static bool jit_enabled = true;
 
 PYBIND11_MODULE(pyrys, m) {
   m.doc() = "A DO fusion backend for Pytorch JIT";
 
+  /*
   RegisterPass pass_1([](std::shared_ptr<Graph>& g) {
     if (pyrys_enabled) {
       torch::jit::OpRewritePass(g);
@@ -30,14 +33,17 @@ PYBIND11_MODULE(pyrys, m) {
       torch::jit::FormatOptimize(g);
     }
   });
+  */
   RegisterPass pass_3([](std::shared_ptr<Graph>& g) {
-    if (pyrys_enabled) {
+    if (jit_enabled) {
+    std::cout << "in init\n";
       torch::jit::FusionPass(g);
     }
   });
 
-  m.def("enable", []() { pyrys_enabled = true; });
-  m.def("disable", []() { pyrys_enabled = false; });
+  m.def("enable", []() { jit_enabled = true; });
+  m.def("disable", []() { jit_enabled = false; });
+  /*
   m.def("dnnl_conv2d", at::native::dnnl_conv2d, "A conv2d function of dnnl");
   m.def("dnnl_conv2d_relu", at::native::dnnl_conv2d_relu, "A conv2d_relu function of dnnl");
   m.def("dnnl_relu", at::native::dnnl_relu, "A relu function of dnnl");
@@ -45,4 +51,5 @@ PYBIND11_MODULE(pyrys, m) {
   m.def("dnnl_batch_norm", at::native::dnnl_batch_norm, "A batch_norm function of dnnl");
   m.def("dnnl_pooling_max_2d", at::native::dnnl_pooling_max_2d, "A max-pooling-2d funtion of dnnl");
   m.def("dnnl_pooling_avg_2d", at::native::dnnl_pooling_avg_2d, "An avg-pooling-2d funtion of dnnl");
+  */
 }
diff --git a/torch_ipex/csrc/jit/op_rewrite.cpp b/torch_ipex/csrc/jit/op_rewrite.cpp
index 481cba542..8a5a65efe 100644
--- a/torch_ipex/csrc/jit/op_rewrite.cpp
+++ b/torch_ipex/csrc/jit/op_rewrite.cpp
@@ -2,7 +2,7 @@
 #include <torch/csrc/jit/ir/constants.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
 
-#include <ideep.hpp>
+#include <cpu/dil/dil.hpp>
 
 #include "graph_ext.h"
 #include "op_rewrite.h"
@@ -24,7 +24,7 @@ NodeExt* replaceOpWithDNNL(Node *node, Graph *g) {
 
   auto* replacement = reinterpret_cast<NodeExt *>(
       replaceOpWithNewKind(node, g, rules.at(node->kind())));
-  replacement->initFormatInfo();
+  //replacement->initFormatInfo();
   return replacement;
 }
 
@@ -47,10 +47,11 @@ void OpRewritePass(Block *block) {
       // need a reorder to transform it back
       //
       auto newNode = replaceOpWithDNNL(node, block->owningGraph());
+      /*
       auto conv2d = newNode->cast<Conv2dNode>();
       conv2d->fixWeightFormatIfPossible();
       conv2d->appendReorder(natureFormat);
-
+      */
       // If we could get more information about the weights
       // We could prepend a reorder for the weights and constant propagation
       // might help us create a MKL-DNN friendly weight
@@ -62,20 +63,24 @@ void OpRewritePass(Block *block) {
       //
       auto lh_node = node->input(0)->node();
       auto rh_node = node->input(1)->node();
+      /*
       auto by_pass_reorder = [](const Node *n) {
         return (n->kind() == dnnl::reorder)
           ? n->input()->node() : n;
       };
-
+      */
       //
       // higher priority for conv+sum fusion than other kind
       // possibly we check whether there is a chance for conv+sum+relu
       //
+      /*
       if (by_pass_reorder(lh_node)->kind() == dnnl::conv2d
           || by_pass_reorder(rh_node)->kind() == dnnl::conv2d
           || by_pass_reorder(lh_node)->kind() == dnnl::batch_norm
           || by_pass_reorder(rh_node)->kind() == dnnl::batch_norm)
         replaceOpWithDNNL(node, block->owningGraph());
+      */
+      replaceOpWithDNNL(node, block->owningGraph());
     } else if (node->matches("aten::relu(Tensor self) -> Tensor")
         || node->matches("aten::relu_(Tensor(a!) self) -> Tensor(a!)")
         || node->matches(
@@ -102,10 +107,10 @@ void OpRewritePass(Block *block) {
       }
 
       auto newNode = replaceOpWithDNNL(node, block->owningGraph());
-      newNode->appendReorder(natureFormat);
+      //newNode->appendReorder(natureFormat);
     } else if (node->matches("aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride=[], int[] padding, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor")) {
       auto newNode = replaceOpWithDNNL(node, block->owningGraph());
-      newNode->appendReorder(natureFormat);
+      //newNode->appendReorder(natureFormat);
     }
   }
 }
diff --git a/torch_ipex/csrc/jit/op_rewrite.h b/torch_ipex/csrc/jit/op_rewrite.h
index cac609ba7..2c1ab26a8 100644
--- a/torch_ipex/csrc/jit/op_rewrite.h
+++ b/torch_ipex/csrc/jit/op_rewrite.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include <memory>
-#include <ideep.hpp>
+#include "cpu/dil/dil.hpp"
 #include <torch/csrc/jit/ir/ir.h>
 
 namespace torch { namespace jit {
diff --git a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
index 487e2711d..9417ac716 100644
--- a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
+++ b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
@@ -2,15 +2,15 @@
 #include "torch/csrc/jit/runtime/custom_operator.h"
 #include "accelerated_ops.h"
 #include "graph_ext.h"
-#include "dnnl_ops.h"
+#include "cpu/DevOPs.h"
+//#include "dnnl_ops.h"
+
 
 namespace torch {
 namespace jit {
 
-c10::OperatorOptions aliasAnalysisFromSchema() {
-  c10::OperatorOptions result;
-  result.setAliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA);
-  return result;
+c10::AliasAnalysisKind  aliasAnalysisFromSchema() {
+  return c10::AliasAnalysisKind::FROM_SCHEMA;
 }
 
 at::Tensor toOptionalTensor(const IValue& v) {
@@ -20,7 +20,7 @@ at::Tensor toOptionalTensor(const IValue& v) {
   return v.toTensor();
 }
 
-using namespace at::native;
+using namespace torch_ipex::cpu;
 
 RegisterOperators op({
     Operator(
@@ -32,8 +32,9 @@ RegisterOperators op({
           auto to = enode->inputFormat(1);
           auto groups = enode->getGroupInfo();
 
-          auto result = dnnl_reorder(
-              (std::move(peek(stack, 0, 1))).toTensor(), from, to, groups);
+          // auto result = dnnl_reorder(
+          //    (std::move(peek(stack, 0, 1))).toTensor(), from, to, groups);
+          auto result = at::Tensor();
           drop(stack, 1);
           pack(stack, std::move(result));
           return 0;
@@ -45,8 +46,7 @@ RegisterOperators op({
       "dnnl::relu(Tensor self) -> Tensor",
       [](const Node* node) -> Operation {
         return [] (Stack& stack) {
-          auto result = dnnl_relu(
-              (std::move(peek(stack, 0, 1))).toTensor());
+          auto result = AtenIpexCPUDev::dil_relu((std::move(peek(stack, 0, 1))).toTensor());
           drop(stack, 1);
           pack(stack, std::move(result));
           return 0;
@@ -60,7 +60,7 @@ RegisterOperators op({
         return [] (Stack& stack) {
           at::Tensor input;
           pop(stack, input);
-          auto result = dnnl_relu_(input);
+          auto result = AtenIpexCPUDev::dil_relu_(input);
           push(stack, std::move(result));
           return 0;
         };
@@ -71,7 +71,7 @@ RegisterOperators op({
       "dnnl::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor",
       [] (const Node* node) -> Operation {
         return [] (Stack& stack) {
-          auto result = dnnl_conv2d(
+          auto result = AtenIpexCPUDev::dil_convolution(
               (std::move(peek(stack, 0, 7))).toTensor(),
               (std::move(peek(stack, 1, 7))).toTensor(),
               toOptionalTensor(std::move(peek(stack, 2, 7))),
@@ -90,7 +90,7 @@ RegisterOperators op({
       "dnnl::conv2d_relu(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor",
       [] (const Node* node) ->Operation {
         return [] (Stack& stack) {
-          auto result = dnnl_conv2d_relu(
+          auto result = AtenIpexCPUDev::dil_convolution_relu(
               (std::move(peek(stack, 0, 7))).toTensor(),
               (std::move(peek(stack, 1, 7))).toTensor(),
               toOptionalTensor(std::move(peek(stack, 2, 7))),
@@ -109,6 +109,7 @@ RegisterOperators op({
       "dnnl::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor",
       [] (const Node* node) ->Operation {
         return [] (Stack& stack) {
+          /*
           auto result = dnnl_batch_norm(
               (std::move(peek(stack, 0, 9))).toTensor(),
               toOptionalTensor(std::move(peek(stack, 1, 9))),
@@ -119,6 +120,8 @@ RegisterOperators op({
               (std::move(peek(stack, 6, 9))).toDouble(),
               (std::move(peek(stack, 7, 9))).toDouble(),
               (std::move(peek(stack, 8, 9))).toBool());
+          */
+          auto result = at::Tensor();
           drop(stack, 9);
           pack(stack, std::move(result));
           return 0;
@@ -130,11 +133,14 @@ RegisterOperators op({
       "dnnl::fold_weight(Tensor weight, Tensor? bn_weight, Tensor? running_var, float eps) -> Tensor",
       [] (const Node* node) -> Operation {
         return [] (Stack& stack) {
+          /*
           auto result = dnnl_fold_weight(
               (std::move(peek(stack, 0, 4))).toTensor(),
               toOptionalTensor(std::move(peek(stack, 1, 4))),
               toOptionalTensor(std::move(peek(stack, 2, 4))),
               (std::move(peek(stack, 3, 4))).toDouble());
+          */
+          auto result = at::Tensor();
           drop(stack, 4);
           pack(stack, std::move(result));
           return 0;
@@ -146,6 +152,7 @@ RegisterOperators op({
       "dnnl::fold_bias(Tensor weight, Tensor? bias, Tensor? bn_weight, Tensor? bn_bias, Tensor? running_mean, Tensor? running_var, float eps) -> Tensor",
       [] (const Node* node) -> Operation{
         return [] (Stack& stack) {
+          /*
           auto result = dnnl_fold_bias(
               (std::move(peek(stack, 0, 7))).toTensor(),
               toOptionalTensor(std::move(peek(stack, 1, 7))),
@@ -154,6 +161,8 @@ RegisterOperators op({
               toOptionalTensor(std::move(peek(stack, 4, 7))),
               toOptionalTensor(std::move(peek(stack, 5, 7))),
               (std::move(peek(stack, 6, 7))).toDouble());
+          */
+          auto result = at::Tensor();
           drop(stack, 7);
           pack(stack, std::move(result));
           return 0;
@@ -165,11 +174,14 @@ RegisterOperators op({
       "dnnl::sum(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor",
       [] (const Node* node) ->Operation {
         return [] (Stack& stack) {
+          /*
           auto result = dnnl_sum(
               (std::move(peek(stack, 0, 3))).toTensor(),
               (std::move(peek(stack, 1, 3))).toTensor(),
               (std::move(peek(stack, 2, 3))).toScalar()
           );
+          */
+          auto result = at::Tensor();
           drop(stack, 3);
           pack(stack, std::move(result));
           return 0;
@@ -182,10 +194,13 @@ RegisterOperators op({
       [] (const Node* node) ->Operation{
         return [](Stack &stack) {
           auto self = (std::move(peek(stack, 0, 3))).toTensor();
+          /*
           auto result = dnnl_sum_(
               self,
               (std::move(peek(stack, 1, 3))).toTensor(),
               (std::move(peek(stack, 2, 3))).toScalar());
+          */
+          auto result = at::Tensor();
           drop(stack, 3);
           pack(stack, std::move(result));
           return 0;
@@ -193,6 +208,7 @@ RegisterOperators op({
       },
       aliasAnalysisFromSchema()
       ),
+    /*
     Operator(
       "dnnl::conv2d_sum(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, Tensor(a!) accumu, *, Scalar alpha=1) -> Tensor(a!)",
       [] (const Node* node) ->Operation {
@@ -209,6 +225,7 @@ RegisterOperators op({
               output,
               (std::move(peek(stack, 8, 9))).toScalar()
           );
+          auto result = at::Tensor();
           drop(stack, 9);
           pack(stack, std::move(result));
           return 0;
@@ -232,17 +249,19 @@ RegisterOperators op({
               output,
               (std::move(peek(stack, 8, 9))).toScalar()
           );
+          auto result = at::Tensor();
           drop(stack, 9);
           pack(stack, std::move(result));
           return 0;
         };
       },
       aliasAnalysisFromSchema()
-      ),
+      ),*/
     Operator(
       "dnnl::pooling_max_2d(Tensor input, int[2] kernel_size, int[2] stride=1, int[2] padding=0, int[2] dilation=1, bool ceil_mode=0) -> Tensor(a!)",
       [] (const Node *node) ->Operation {
         return [] (Stack& stack) {
+          /*
           auto result = dnnl_pooling_max_2d(
               (std::move(peek(stack, 0, 6))).toTensor(),      // Input tensor
               (std::move(peek(stack, 1, 6))).toIntVector(),  // Kernel size
@@ -250,6 +269,8 @@ RegisterOperators op({
               (std::move(peek(stack, 3, 6))).toIntVector(),  // Padding
               (std::move(peek(stack, 4, 6))).toIntVector(),  // Dilation
               (std::move(peek(stack, 5, 6))).toBool());       // Ceil mode
+          */
+          auto result = at::Tensor();
           drop(stack, 6);
           pack(stack, std::move(result));
           return 0;
@@ -261,12 +282,15 @@ RegisterOperators op({
       "dnnl::pooling_avg_2d(Tensor input, int[2] kernel_size, int[2] stride=1, int[2] padding=0, bool ceil_mode=0, bool count_include_pad=True, int? divisor_override=None) -> Tensor(a!)",
       [] (const Node *node) ->Operation {
         return [] (Stack& stack) {
+          /*
           auto result = dnnl_pooling_avg_2d(
               (std::move(peek(stack, 0, 7))).toTensor(),      // Input tensor
               (std::move(peek(stack, 1, 7))).toIntVector(),  // Kernel size
               (std::move(peek(stack, 2, 7))).toIntVector(),  // Stride
               (std::move(peek(stack, 3, 7))).toIntVector(),  // Padding
               (std::move(peek(stack, 4, 7))).toBool());       // Ceil mode
+          */
+          auto result = at::Tensor();
           drop(stack, 7);
           pack(stack, std::move(result));
           return 0;

From bbd70f8e4b8e107bdbf95c5b0fdeeeab56c1ac98 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Tue, 19 May 2020 18:30:03 +0800
Subject: [PATCH 02/10] only jit fusion for extension path

---
 torch_ipex/csrc/cpu/DevOPs.cpp                |  35 ---
 torch_ipex/csrc/cpu/DevOPs.h                  |   3 -
 torch_ipex/csrc/cpu/FusionOPs.cpp             |  59 ++++
 torch_ipex/csrc/cpu/FusionOPs.h               |  35 +++
 torch_ipex/csrc/init_python_bindings.cpp      |  18 +-
 torch_ipex/csrc/jit/CMakeLists.txt            |   2 -
 torch_ipex/csrc/jit/accelerated_ops.h         |   2 +-
 torch_ipex/csrc/jit/dnnl_ops.h                |   2 +-
 torch_ipex/csrc/jit/fusion_pass.cpp           | 152 ++++-------
 torch_ipex/csrc/jit/graph_ext.cpp             |  13 +-
 torch_ipex/csrc/jit/graph_ext.h               |  22 +-
 torch_ipex/csrc/jit/init.cpp                  |  21 +-
 torch_ipex/csrc/jit/op_rewrite.cpp            |  17 +-
 torch_ipex/csrc/jit/op_rewrite.h              |   2 +-
 torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp | 256 ++----------------
 15 files changed, 198 insertions(+), 441 deletions(-)
 create mode 100644 torch_ipex/csrc/cpu/FusionOPs.cpp
 create mode 100644 torch_ipex/csrc/cpu/FusionOPs.h

diff --git a/torch_ipex/csrc/cpu/DevOPs.cpp b/torch_ipex/csrc/cpu/DevOPs.cpp
index a162d7863..ebe231f42 100644
--- a/torch_ipex/csrc/cpu/DevOPs.cpp
+++ b/torch_ipex/csrc/cpu/DevOPs.cpp
@@ -67,41 +67,6 @@ at::Tensor AtenIpexCPUDev::dil_convolution(
   return dbl::comm::gen_aten_tensor_by(std::move(dil_output));
 }
 
-at::Tensor AtenIpexCPUDev::dil_convolution_relu(
-    const at::Tensor & input,
-    const at::Tensor & weight,
-    const at::Tensor & bias,
-    at::IntArrayRef stride,
-    at::IntArrayRef padding,
-    at::IntArrayRef dilation,
-    int64_t groups) {
-  DEBUG("AtenIpexCPUDev::dil_convolution\n");
-  dil::tensor dil_input;
-  dil::tensor dil_weight;
-  c10::optional<dil::tensor> dil_bias{c10::nullopt};
-
-  CHECK_DNNL_OP_PRE_COND(input);
-  CHECK_DNNL_OP_PRE_COND(weight);
-  dil_input = dbl::comm::try_gen_dil_tensor(input);
-  dil_weight = dbl::comm::try_gen_dil_tensor(weight);
-  if (bias.defined()) {
-    CHECK_DNNL_OP_PRE_COND(bias);
-    dil_bias = dbl::comm::try_gen_dil_tensor(bias);
-  }
-
-  dil::tensor dil_output = dbl::conv::conv2d_impl(
-    dil_input,
-    dil_weight,
-    dil_bias,
-    padding,
-    stride,
-    dilation,
-    groups,
-    dil::attr_t::fuse_relu());
-
-  return dbl::comm::gen_aten_tensor_by(dil_output);
-}
-
 at::Tensor dil_convolution_backward_input(
     at::IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight,
     at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool bias_defined)
diff --git a/torch_ipex/csrc/cpu/DevOPs.h b/torch_ipex/csrc/cpu/DevOPs.h
index 856bee0a7..941bf93a8 100644
--- a/torch_ipex/csrc/cpu/DevOPs.h
+++ b/torch_ipex/csrc/cpu/DevOPs.h
@@ -70,9 +70,6 @@ class AtenIpexCPUDev {
   static std::vector<at::Tensor> dil_split_with_sizes(const at::Tensor& self, at::IntArrayRef split_sizes, int64_t dim);
   static std::vector<at::Tensor> dil_split(const at::Tensor& self, int64_t split_size, int64_t dim);
 
-  // for JIT ops
-  static at::Tensor dil_convolution_relu(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups);
-
 };
 
 }  // namespace cpu
diff --git a/torch_ipex/csrc/cpu/FusionOPs.cpp b/torch_ipex/csrc/cpu/FusionOPs.cpp
new file mode 100644
index 000000000..aa06f5105
--- /dev/null
+++ b/torch_ipex/csrc/cpu/FusionOPs.cpp
@@ -0,0 +1,59 @@
+#include "torch_ipex/csrc/cpu/FusionOPs.h"
+
+#include <ATen/Context.h>
+#include <ATen/CPUGenerator.h>
+#include <ATen/InferSize.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Logging.h>
+
+#include <limits>
+
+#include "torch_ipex/csrc/aten_ipex_bridge.h"
+#include "torch_ipex/csrc/ipex_tensor_impl.h"
+#include "torch_ipex/csrc/utils.h"
+#include "dbl/Common.h"
+#include "dbl/Conv.h"
+#include "ShadeDataContext.h"
+
+#include "dil/dil.hpp"
+
+namespace torch_ipex {
+namespace cpu {
+
+at::Tensor AtenIpexJITDev::dil_convolution_relu(
+    const at::Tensor & input,
+    const at::Tensor & weight,
+    const at::Tensor & bias,
+    at::IntArrayRef stride,
+    at::IntArrayRef padding,
+    at::IntArrayRef dilation,
+    int64_t groups) {
+  dil::tensor dil_input;
+  dil::tensor dil_weight;
+  c10::optional<dil::tensor> dil_bias{c10::nullopt};
+
+  auto input_contiguous = input.contiguous();
+  auto weight_contiguous = weight.contiguous();
+
+  dil_input = dbl::comm::try_gen_dil_tensor(input_contiguous);
+  dil_weight = dbl::comm::try_gen_dil_tensor(weight_contiguous);
+  if (bias.defined()) {
+    auto bias_contiguous = bias.contiguous();
+    dil_bias = dbl::comm::try_gen_dil_tensor(bias_contiguous);
+  }
+
+  dil::tensor dil_output = dbl::conv::conv2d_impl(
+    dil_input,
+    dil_weight,
+    dil_bias,
+    padding,
+    stride,
+    dilation,
+    groups,
+    dil::attr_t::fuse_relu());
+
+  return dbl::comm::gen_aten_tensor_by(dil_output);
+}
+
+}  // namespace cpu
+}  // namespace torch_ipex
diff --git a/torch_ipex/csrc/cpu/FusionOPs.h b/torch_ipex/csrc/cpu/FusionOPs.h
new file mode 100644
index 000000000..14f3db7e1
--- /dev/null
+++ b/torch_ipex/csrc/cpu/FusionOPs.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <ATen/Tensor.h>
+
+#include <torch/csrc/jit/runtime/custom_operator.h>
+
+#include "dil/dil.hpp"
+
+namespace torch { namespace jit {
+
+// XXX: PyTorch does not support nested namespaces,
+// and alias analysis does not work for namespaces other than aten,
+// so we fake some op namespaces to work around that.
+namespace dnnl {
+  static auto conv2d_relu = Symbol::fromQualString("dnnl::conv2d_relu");
+  static auto conv2d_sum = Symbol::fromQualString("dnnl::conv2d_sum");
+  static auto conv2d_relu_sum = Symbol::fromQualString("dnnl::conv2d_relu_sum");
+  static auto conv2d_sum_relu = Symbol::fromQualString("dnnl::conv2d_sum_relu");
+
+}
+
+}} // namespace torch::jit
+
+namespace torch_ipex {
+namespace cpu {
+
+class AtenIpexJITDev {
+ public:
+  // for JIT ops
+  static at::Tensor dil_convolution_relu(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups);
+
+};
+
+}  // namespace cpu
+}  // namespace torch_ipex
diff --git a/torch_ipex/csrc/init_python_bindings.cpp b/torch_ipex/csrc/init_python_bindings.cpp
index 4ffe64246..33492bf09 100644
--- a/torch_ipex/csrc/init_python_bindings.cpp
+++ b/torch_ipex/csrc/init_python_bindings.cpp
@@ -10,7 +10,6 @@
 #include <torch/csrc/jit/runtime/operator_options.h>
 #include <torch/csrc/jit/passes/pass_manager.h>
 #include "jit/fusion_pass.h"
-#include "jit/op_rewrite.h"
 
 #include <cstring>
 #include <sstream>
@@ -148,23 +147,8 @@ using namespace torch::jit;
 
 void InitIpexBindings(py::module m) {
   InitIpexModuleBindings(m);
-
-  // fro jit path
-  RegisterPass pass_1([](std::shared_ptr<Graph>& g) {
-    if (AutoOptConfig::singleton().get_jit_fuse()) {
-      torch::jit::OpRewritePass(g);
-    }
-  });
-  /*
-  RegisterPass pass_2([](std::shared_ptr<Graph>& g) {
-    if (AutoOptConfig::singleton().get_jit_fuse()) {
-      std::cout<<"uisng pass2"<<std::endl;
-      torch::jit::FormatOptimize(g);
-    }
-  });
-  */
   // jit fusion pass
-  RegisterPass pass3([](std::shared_ptr<Graph>& g) {
+  RegisterPass pass([](std::shared_ptr<Graph>& g) {
     if (AutoOptConfig::singleton().get_jit_fuse()) {
       torch::jit::FusionPass(g);
     }
diff --git a/torch_ipex/csrc/jit/CMakeLists.txt b/torch_ipex/csrc/jit/CMakeLists.txt
index 58f3e2729..3f313b336 100644
--- a/torch_ipex/csrc/jit/CMakeLists.txt
+++ b/torch_ipex/csrc/jit/CMakeLists.txt
@@ -1,7 +1,5 @@
 LIST(APPEND DPCPP_JIT_SRCS
     ${DPCPP_ROOT}/jit/fusion_pass.cpp
-    ${DPCPP_ROOT}/jit/graph_ext.cpp
-    ${DPCPP_ROOT}/jit/op_rewrite.cpp
     ${DPCPP_ROOT}/jit/register_dnnl_jit_ops.cpp
 
 )
diff --git a/torch_ipex/csrc/jit/accelerated_ops.h b/torch_ipex/csrc/jit/accelerated_ops.h
index 9183334f9..3d4b6944b 100644
--- a/torch_ipex/csrc/jit/accelerated_ops.h
+++ b/torch_ipex/csrc/jit/accelerated_ops.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "cpu/dil/dil.hpp"
+#include <ideep.hpp>
 #include <torch/csrc/jit/runtime/custom_operator.h>
 
 namespace torch { namespace jit {
diff --git a/torch_ipex/csrc/jit/dnnl_ops.h b/torch_ipex/csrc/jit/dnnl_ops.h
index abb7de3de..547c03675 100644
--- a/torch_ipex/csrc/jit/dnnl_ops.h
+++ b/torch_ipex/csrc/jit/dnnl_ops.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "cpu/dil/dil.hpp"
+#include <ideep.hpp>
 #include <ATen/ATen.h>
 #include <ATen/NativeFunctions.h>
 
diff --git a/torch_ipex/csrc/jit/fusion_pass.cpp b/torch_ipex/csrc/jit/fusion_pass.cpp
index ef528e432..9568e1015 100644
--- a/torch_ipex/csrc/jit/fusion_pass.cpp
+++ b/torch_ipex/csrc/jit/fusion_pass.cpp
@@ -1,7 +1,8 @@
 #include <string>
-#include "graph_ext.h"
 #include "fusion_pass.h"
-#include "accelerated_ops.h"
+
+#include "cpu/FusionOPs.h"
+
 #include <torch/csrc/utils/hash.h>
 #include <torch/csrc/jit/runtime/operator.h>
 #include <torch/csrc/jit/passes/subgraph_rewrite.h>
@@ -80,85 +81,38 @@ class OpFuser {
     aliasDb_ = std::make_unique<AliasDb>(graph_);
   }
 
-  Node* fuseNodes(Node *curr, Value *path, Rule rule) {
-    return fuseOpsWithNewKind(curr, path, curr->owningGraph(), rule->second);
-  }
-
-  //
-  // currently we only have to fold conv2d + batch_norm
-  //
-  bool isFoldable(Node* node, Node* prev) {
-    bool foldable = (node->kind() == aten::batch_norm
-        && prev->kind() == aten::conv2d);
-    //
-    // Check whether all the sources are constant ???
-    // Does performance improve no matter we do it pre-compiling or runtime?
-    //
-
-    auto* conv2d = reinterpret_cast<NodeExt *>(prev)->cast<Conv2dNode>();
-    auto* batch_norm = reinterpret_cast<NodeExt *>(node)->cast<BatchNorm2dNode>();
-
-    foldable = foldable
-      && conv2d->hasConstantParams()
-      && batch_norm->hasConstantParams();
-
-    return foldable;
-  }
-
-  Node* foldNodes(Node *conv2d, Node *batch_norm) {
-    // Change weight/bias source
-    auto* fold_weight = createBatchNormFoldWeight(conv2d, batch_norm);
-    fold_weight->insertBefore(conv2d);
-    conv2d->replaceInput(1, fold_weight->output());
+  Node* fuseOpsWithNewKind(Node *curr, Value *v, Graph *g, NodeKind kind) {
+    auto newNode = g->create(kind);
+    auto prev = v->node();
+    newNode->insertBefore(prev);
+    newNode->setScope(prev->scope());
+    newNode->copyAttributes(*prev);
 
-    auto* fold_bias = createBatchNormFoldBias(conv2d, batch_norm);
-    fold_bias->insertBefore(conv2d);
-    conv2d->replaceInput(2, fold_bias->output());
+    for (auto input : prev->inputs()) {
+      newNode->addInput(input);
+    }
 
-    batch_norm->replaceAllUsesWith(conv2d);
-    batch_norm->destroy();
-    return conv2d;
-  }
+    for (auto input : curr->inputs()) {
+      if (input != v) {
+        newNode->addInput(input);
+      }
+    }
 
-  Node* createBatchNormFoldWeight(Node *conv2d, Node *batch_norm) {
-    auto g = conv2d->owningGraph();
-    auto newNode = g->create(dnnl::fold_weight);
-    newNode->setScope(conv2d->scope());
+    // Copy curr or prev?
+    newNode->output()->copyMetadata(prev->output());
+    newNode->output()->setType(prev->output()->type());
 
-    // We need following parameters
+    v->replaceAllUsesWith(newNode->output());
+    curr->replaceAllUsesWith(newNode);
 
-    newNode->addInput(conv2d->input(1));  // Conv2d weights
-    newNode->addInput(batch_norm->input(1)); // Batch norm weights
-    newNode->addInput(batch_norm->input(4)); // running_var (delta)
-    newNode->addInput(batch_norm->input(7)); // eps
+    prev->destroy();
+    curr->destroy();
 
-    // We get meta and type from conv2d weight value
-    newNode->output()->copyMetadata(conv2d->input(1));
-    newNode->output()->setType(conv2d->input(1)->type());
-    newNode->output()->setDebugName(conv2d->input(1)->debugName() + ".bn_folded");
     return newNode;
   }
 
-  Node* createBatchNormFoldBias(Node *conv2d, Node *batch_norm) {
-    auto g = conv2d->owningGraph();
-    auto newNode = g->create(dnnl::fold_bias);
-    newNode->setScope(conv2d->scope());
-
-    // We need following information
-    newNode->addInput(conv2d->input(1)); // Conv weight
-    newNode->addInput(conv2d->input(2)); // Conv bias
-    newNode->addInput(batch_norm->input(1)); // batch norm weight
-    newNode->addInput(batch_norm->input(2)); // batch norm bias
-    newNode->addInput(batch_norm->input(3)); // running_mean (mu)
-    newNode->addInput(batch_norm->input(4)); // running_var (delta)
-    newNode->addInput(batch_norm->input(7)); // eps
-
-    // We get meta and type from conv2d bias value
-    newNode->output()->copyMetadata(conv2d->input(2));
-    newNode->output()->setType(conv2d->input(2)->type());
-    newNode->output()->setDebugName(conv2d->input(2)->debugName() + ".bn_folded");
-
-    return newNode;
+  Node* fuseNodes(Node *curr, Value *path, Rule rule) {
+    return fuseOpsWithNewKind(curr, path, curr->owningGraph(), rule->second);
   }
 
   bool aliasIsSafeForSquashingValue(Node *node, Value *v) {
@@ -199,7 +153,6 @@ class OpFuser {
     }
 
     // throw
-    //auto er = script::ErrorReport(node->sourceRange());
     auto er = ErrorReport(node->sourceRange());
     er << "Schema not found for fusion process. \n";
     er << "Prev: " << *prev << "\n";
@@ -297,52 +250,40 @@ class OpFuser {
   }
 
   std::pair<graph_node_list::iterator, bool> processNode(Node *node) {
-    auto nodeExt = reinterpret_cast<NodeExt *>(node);
 
     Node* pos = node;
     bool changed = false;
 
-    if (nodeExt->isDNNLOps()) {
-      //
-      // Check whether we could fuse to one certain value path
-      //
-      for (auto *v : node->inputs()) {
-        auto prev = v->node();
-        auto fuseRule = isFusable(node, prev);
-
-        // We can fuse only one path
-        if (fuseRule && aliasIsSafeForFusion(node, v, fuseRule)) {
-          pos = fuseNodes(node, v, fuseRule.value());
-          changed = true;
-          break;
-        } else if (isFoldable(node, prev)
-            && aliasIsSafeForSquashingValue(node, v)) {
-          pos = foldNodes(prev, node);
-          changed = true;
-          break;
-        }
+    //
+    // Check whether we could fuse to one certain value path
+    //
+    for (auto *v : node->inputs()) {
+      auto prev = v->node();
+      auto fuseRule = isFusable(node, prev);
+
+      // We can fuse only one path
+      if (fuseRule && aliasIsSafeForFusion(node, v, fuseRule)) {
+        pos = fuseNodes(node, v, fuseRule.value());
+        changed = true;
+        break;
       }
     }
-
     return std::make_pair(++pos->iterator(), changed);
   }
+
 };
 
 // TODO: These rules should be more scalable
 OpFuser::RuleTab OpFuser::dnnlRules = {
-  {{dnnl::conv2d, dnnl::relu}, dnnl::conv2d_relu},
-  {{dnnl::conv2d, dnnl::relu_}, dnnl::conv2d_relu},
-  /*
-  {{dnnl::batch_norm, dnnl::relu}, dnnl::batch_norm_relu},
-  {{dnnl::batch_norm, dnnl::relu_}, dnnl::batch_norm_relu},
-  */
+  {{aten::conv2d, aten::relu}, dnnl::conv2d_relu},
+  {{aten::conv2d, Symbol::fromQualString("aten::relu_")}, dnnl::conv2d_relu},
   /*
-  {{dnnl::conv2d_sum, dnnl::relu}, dnnl::conv2d_sum_relu},
-  {{dnnl::conv2d_sum, dnnl::relu_}, dnnl::conv2d_sum_relu},
+  {{AtenIpexCPUDev::conv2d_sum, AtenIpexCPUDev::relu}, AtenIpexCPUDev::conv2d_sum_relu},
+  {{AtenIpexCPUDev::conv2d_sum, dnnl::relu_}, AtenIpexCPUDev::conv2d_sum_relu},
 
-  {{dnnl::conv2d, dnnl::sum}, dnnl::conv2d_sum},
-  {{dnnl::conv2d, dnnl::sum_}, dnnl::conv2d_sum},
-  {{dnnl::conv2d_relu, dnnl::sum}, dnnl::conv2d_relu_sum}
+  {{aten::conv2d, aten::add}, AtenIpexCPUDev::conv2d_sum},
+  {{aten::conv2d, aten::add_}, AtenIpexCPUDev::conv2d_sum},
+  {{AtenIpexCPUDev::conv2d_relu, aten::add}, AtenIpexCPUDev::conv2d_relu_sum}
   */
 };
 
@@ -355,4 +296,5 @@ void FusionPass(std::shared_ptr<Graph> &graph) {
   // TODO: Some post processing?? ECS/EDC/Peephole???
   ConstantPropagation(graph);
 }
+
 }} // namespace torch::jit
diff --git a/torch_ipex/csrc/jit/graph_ext.cpp b/torch_ipex/csrc/jit/graph_ext.cpp
index 658aa4011..efbec2cf8 100644
--- a/torch_ipex/csrc/jit/graph_ext.cpp
+++ b/torch_ipex/csrc/jit/graph_ext.cpp
@@ -2,7 +2,6 @@
 #include "accelerated_ops.h"
 
 namespace torch { namespace jit {
-/*
 void NodeExt::initFormatInfo() {
   std::vector<int64_t> formatInfo (
       this->inputs().size() + this->outputs().size(),
@@ -10,7 +9,7 @@ void NodeExt::initFormatInfo() {
 
   this->is_(attr::format_info, std::move(formatInfo));
 }
-*/
+
 const std::vector<int64_t>& NodeExt::getFormatInfo() const {
   return this->is(attr::format_info);
 }
@@ -41,7 +40,6 @@ void NodeExt::setGroupInfo(int64_t groups) {
   this->i_(attr::group_info, groups);
 }
 
-/*
 Node *NodeExt::createReorder(Value *v, Graph *g, formatTag from, formatTag to) {
   NodeExt *reorder = nullptr;
   if (from != to) {
@@ -108,7 +106,7 @@ Node* NodeExt::appendReorder(formatTag to, int i) {
 
   return reorder;
 }
-*/
+
 void NodeExt::propagateFormats() {
   // TODO: Need consultant with acceleration libraries
   setOutputFormat(inputFormat());
@@ -187,7 +185,7 @@ bool Conv2dNode::hasConstantParams() const {
 
   return has;
 }
-/*
+
 formatTag Conv2dNode::expectedWeightFormat(
     c10::ArrayRef<int64_t> sizes,
     c10::List<int64_t> stride,
@@ -223,9 +221,8 @@ void Conv2dNode::fixWeightFormatIfPossible() {
     this->prependReorders(use_list {{this, 1}}, {natureWeightFormat}, {groups});
   }
 }
-*/
+
 bool BatchNorm2dNode::hasConstantParams() const {
-  /*
   bool has =
     this->input(1)->node()->kind() == prim::Constant
       && this->input(2)->node()->kind() == prim::Constant
@@ -236,8 +233,6 @@ bool BatchNorm2dNode::hasConstantParams() const {
   // TODO: more check to make sure
 
   return has;
-  */
-  return true;
 }
 
 }} // namespace torch::jit
diff --git a/torch_ipex/csrc/jit/graph_ext.h b/torch_ipex/csrc/jit/graph_ext.h
index 74f762a37..34a141854 100644
--- a/torch_ipex/csrc/jit/graph_ext.h
+++ b/torch_ipex/csrc/jit/graph_ext.h
@@ -2,23 +2,21 @@
 
 #include <vector>
 #include <memory>
-
-#include "cpu/dil/dil.hpp"
+#include <ideep.hpp>
 #include "accelerated_ops.h"
-
 #include <c10/util/Optional.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/ir/constants.h>
 
 namespace torch { namespace jit {
-using namespace dil;
-using dataType = dil::tensor::data_type;
-using formatTag = dil::format_tag;
+using namespace ideep;
+using dataType = ideep::tensor::data_type;
+using formatTag = ideep::format;
 using formatList = std::vector<formatTag>;
 using groupsList = std::vector<int64_t>;
 
-//static constexpr auto natureFormat = formatTag::nchw;
-//static constexpr auto natureWeightFormat = formatTag::oihw;
+static constexpr auto natureFormat = formatTag::nchw;
+static constexpr auto natureWeightFormat = formatTag::oihw;
 
 // attributes for pyrys ops to decide which format is on
 // Or what formats transfered by reorder
@@ -73,7 +71,7 @@ class NodeExt : public Node {
     return this->kind() == dnnl::batch_norm;
   }
 
-  //void initFormatInfo();
+  void initFormatInfo();
 
   template <class T> T* cast() {
     return reinterpret_cast<T*>(this);
@@ -81,19 +79,18 @@ class NodeExt : public Node {
 private:
   // we save formats as Ints attribute internally
   const std::vector<int64_t>& getFormatInfo() const;
-  /*
+
   static Node* createReorder(
       Value *v, Graph *g, formatTag from, formatTag to);
   static Node* insertReorder(
       Value *v, Node *insert_point, formatTag from, formatTag to);
-  */
 };
 
 class Conv2dNode : public NodeExt {
 public:
   bool couldInferFormats() const;
   bool hasConstantParams() const;
-  //void fixWeightFormatIfPossible();
+  void fixWeightFormatIfPossible();
   formatTag expectedWeightFormat(
       c10::ArrayRef<int64_t> sizes,
       c10::List<int64_t> stride,
@@ -102,7 +99,6 @@ class Conv2dNode : public NodeExt {
       int64_t groups, dataType dtype = dataType::f32) const;
 };
 
-
 class BatchNorm2dNode : public NodeExt {
 public:
   bool hasConstantParams() const;
diff --git a/torch_ipex/csrc/jit/init.cpp b/torch_ipex/csrc/jit/init.cpp
index 3316d4b4f..487e7019e 100644
--- a/torch_ipex/csrc/jit/init.cpp
+++ b/torch_ipex/csrc/jit/init.cpp
@@ -7,22 +7,19 @@
 #include <torch/csrc/jit/passes/pass_manager.h>
 
 #include "accelerated_ops.h"
-//#include "op_rewrite.h"
-//#include "format_analysis.h"
+#include "op_rewrite.h"
+#include "format_analysis.h"
 #include "fusion_pass.h"
-//#include "dnnl_ops.h"
+#include "dnnl_ops.h"
 
 namespace py = pybind11;
 using namespace torch::jit;
 
-//static bool jit_enabled = false;
-
-static bool jit_enabled = true;
+static bool pyrys_enabled = false;
 
 PYBIND11_MODULE(pyrys, m) {
   m.doc() = "A DO fusion backend for Pytorch JIT";
 
-  /*
   RegisterPass pass_1([](std::shared_ptr<Graph>& g) {
     if (pyrys_enabled) {
       torch::jit::OpRewritePass(g);
@@ -33,17 +30,14 @@ PYBIND11_MODULE(pyrys, m) {
       torch::jit::FormatOptimize(g);
     }
   });
-  */
   RegisterPass pass_3([](std::shared_ptr<Graph>& g) {
-    if (jit_enabled) {
-    std::cout << "in init\n";
+    if (pyrys_enabled) {
       torch::jit::FusionPass(g);
     }
   });
 
-  m.def("enable", []() { jit_enabled = true; });
-  m.def("disable", []() { jit_enabled = false; });
-  /*
+  m.def("enable", []() { pyrys_enabled = true; });
+  m.def("disable", []() { pyrys_enabled = false; });
   m.def("dnnl_conv2d", at::native::dnnl_conv2d, "A conv2d function of dnnl");
   m.def("dnnl_conv2d_relu", at::native::dnnl_conv2d_relu, "A conv2d_relu function of dnnl");
   m.def("dnnl_relu", at::native::dnnl_relu, "A relu function of dnnl");
@@ -51,5 +45,4 @@ PYBIND11_MODULE(pyrys, m) {
   m.def("dnnl_batch_norm", at::native::dnnl_batch_norm, "A batch_norm function of dnnl");
   m.def("dnnl_pooling_max_2d", at::native::dnnl_pooling_max_2d, "A max-pooling-2d funtion of dnnl");
   m.def("dnnl_pooling_avg_2d", at::native::dnnl_pooling_avg_2d, "An avg-pooling-2d funtion of dnnl");
-  */
 }
diff --git a/torch_ipex/csrc/jit/op_rewrite.cpp b/torch_ipex/csrc/jit/op_rewrite.cpp
index 8a5a65efe..481cba542 100644
--- a/torch_ipex/csrc/jit/op_rewrite.cpp
+++ b/torch_ipex/csrc/jit/op_rewrite.cpp
@@ -2,7 +2,7 @@
 #include <torch/csrc/jit/ir/constants.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
 
-#include <cpu/dil/dil.hpp>
+#include <ideep.hpp>
 
 #include "graph_ext.h"
 #include "op_rewrite.h"
@@ -24,7 +24,7 @@ NodeExt* replaceOpWithDNNL(Node *node, Graph *g) {
 
   auto* replacement = reinterpret_cast<NodeExt *>(
       replaceOpWithNewKind(node, g, rules.at(node->kind())));
-  //replacement->initFormatInfo();
+  replacement->initFormatInfo();
   return replacement;
 }
 
@@ -47,11 +47,10 @@ void OpRewritePass(Block *block) {
       // need a reorder to transform it back
       //
       auto newNode = replaceOpWithDNNL(node, block->owningGraph());
-      /*
       auto conv2d = newNode->cast<Conv2dNode>();
       conv2d->fixWeightFormatIfPossible();
       conv2d->appendReorder(natureFormat);
-      */
+
       // If we could get more information about the weights
       // We could prepend a reorder for the weights and constant propagation
       // might help us create a MKL-DNN friendly weight
@@ -63,24 +62,20 @@ void OpRewritePass(Block *block) {
       //
       auto lh_node = node->input(0)->node();
       auto rh_node = node->input(1)->node();
-      /*
       auto by_pass_reorder = [](const Node *n) {
         return (n->kind() == dnnl::reorder)
           ? n->input()->node() : n;
       };
-      */
+
       //
       // higher priority for conv+sum fusion than other kind
       // possibly we check whether there is a chance for conv+sum+relu
       //
-      /*
       if (by_pass_reorder(lh_node)->kind() == dnnl::conv2d
           || by_pass_reorder(rh_node)->kind() == dnnl::conv2d
           || by_pass_reorder(lh_node)->kind() == dnnl::batch_norm
           || by_pass_reorder(rh_node)->kind() == dnnl::batch_norm)
         replaceOpWithDNNL(node, block->owningGraph());
-      */
-      replaceOpWithDNNL(node, block->owningGraph());
     } else if (node->matches("aten::relu(Tensor self) -> Tensor")
         || node->matches("aten::relu_(Tensor(a!) self) -> Tensor(a!)")
         || node->matches(
@@ -107,10 +102,10 @@ void OpRewritePass(Block *block) {
       }
 
       auto newNode = replaceOpWithDNNL(node, block->owningGraph());
-      //newNode->appendReorder(natureFormat);
+      newNode->appendReorder(natureFormat);
     } else if (node->matches("aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride=[], int[] padding, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor")) {
       auto newNode = replaceOpWithDNNL(node, block->owningGraph());
-      //newNode->appendReorder(natureFormat);
+      newNode->appendReorder(natureFormat);
     }
   }
 }
diff --git a/torch_ipex/csrc/jit/op_rewrite.h b/torch_ipex/csrc/jit/op_rewrite.h
index 2c1ab26a8..cac609ba7 100644
--- a/torch_ipex/csrc/jit/op_rewrite.h
+++ b/torch_ipex/csrc/jit/op_rewrite.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include <memory>
-#include "cpu/dil/dil.hpp"
+#include <ideep.hpp>
 #include <torch/csrc/jit/ir/ir.h>
 
 namespace torch { namespace jit {
diff --git a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
index 9417ac716..26760b9df 100644
--- a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
+++ b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
@@ -1,9 +1,10 @@
-#include "torch/csrc/jit/runtime/operator.h"
-#include "torch/csrc/jit/runtime/custom_operator.h"
-#include "accelerated_ops.h"
-#include "graph_ext.h"
-#include "cpu/DevOPs.h"
-//#include "dnnl_ops.h"
+#include <c10/util/Exception.h>
+
+#include <torch/csrc/jit/runtime/operator.h>
+#include <torch/csrc/jit/runtime/custom_operator.h>
+
+#include "torch_ipex/csrc/utils.h"
+#include "cpu/FusionOPs.h"
 
 
 namespace torch {
@@ -23,198 +24,36 @@ at::Tensor toOptionalTensor(const IValue& v) {
 using namespace torch_ipex::cpu;
 
 RegisterOperators op({
-    Operator(
-      "dnnl::reorder(Tensor self) -> Tensor",
-      [](const Node* node) -> Operation {
-        return [node] (Stack& stack) {
-          auto* enode = reinterpret_cast<const NodeExt *>(node);
-          auto from = enode->inputFormat(0);
-          auto to = enode->inputFormat(1);
-          auto groups = enode->getGroupInfo();
-
-          // auto result = dnnl_reorder(
-          //    (std::move(peek(stack, 0, 1))).toTensor(), from, to, groups);
-          auto result = at::Tensor();
-          drop(stack, 1);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
-    Operator(
-      "dnnl::relu(Tensor self) -> Tensor",
-      [](const Node* node) -> Operation {
-        return [] (Stack& stack) {
-          auto result = AtenIpexCPUDev::dil_relu((std::move(peek(stack, 0, 1))).toTensor());
-          drop(stack, 1);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
-    Operator(
-      "dnnl::relu_(Tensor(a!) self) -> Tensor(a!)",
-      [] (const Node* node) -> Operation {
-        return [] (Stack& stack) {
-          at::Tensor input;
-          pop(stack, input);
-          auto result = AtenIpexCPUDev::dil_relu_(input);
-          push(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
-    Operator(
-      "dnnl::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor",
-      [] (const Node* node) -> Operation {
-        return [] (Stack& stack) {
-          auto result = AtenIpexCPUDev::dil_convolution(
-              (std::move(peek(stack, 0, 7))).toTensor(),
-              (std::move(peek(stack, 1, 7))).toTensor(),
-              toOptionalTensor(std::move(peek(stack, 2, 7))),
-              (std::move(peek(stack, 3, 7))).toIntVector(),
-              (std::move(peek(stack, 4, 7))).toIntVector(),
-              (std::move(peek(stack, 5, 7))).toIntVector(),
-              (std::move(peek(stack, 6, 7))).toInt());
-          drop(stack, 7);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
     Operator(
       "dnnl::conv2d_relu(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor",
       [] (const Node* node) ->Operation {
-        return [] (Stack& stack) {
-          auto result = AtenIpexCPUDev::dil_convolution_relu(
-              (std::move(peek(stack, 0, 7))).toTensor(),
-              (std::move(peek(stack, 1, 7))).toTensor(),
-              toOptionalTensor(std::move(peek(stack, 2, 7))),
-              (std::move(peek(stack, 3, 7))).toIntVector(),
-              (std::move(peek(stack, 4, 7))).toIntVector(),
-              (std::move(peek(stack, 5, 7))).toIntVector(),
-              (std::move(peek(stack, 6, 7))).toInt());
-          drop(stack, 7);
-          pack(stack, std::move(result));
-          return 0;
-        };
+        if (torch_ipex::check_auto_dnnl()) {
+          return [] (Stack& stack) {
+            auto result = AtenIpexJITDev::dil_convolution_relu(
+                (std::move(peek(stack, 0, 7))).toTensor(),
+                (std::move(peek(stack, 1, 7))).toTensor(),
+                toOptionalTensor(std::move(peek(stack, 2, 7))),
+                (std::move(peek(stack, 3, 7))).toIntVector(),
+                (std::move(peek(stack, 4, 7))).toIntVector(),
+                (std::move(peek(stack, 5, 7))).toIntVector(),
+                (std::move(peek(stack, 6, 7))).toInt());
+            drop(stack, 7);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        } else {
+          TORCH_CHECK(false, "PyTorch native path not support convolution relu fusion now")
+        }
       },
       aliasAnalysisFromSchema()
-      ),
-    Operator(
-      "dnnl::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor",
-      [] (const Node* node) ->Operation {
-        return [] (Stack& stack) {
-          /*
-          auto result = dnnl_batch_norm(
-              (std::move(peek(stack, 0, 9))).toTensor(),
-              toOptionalTensor(std::move(peek(stack, 1, 9))),
-              toOptionalTensor(std::move(peek(stack, 2, 9))),
-              toOptionalTensor(std::move(peek(stack, 3, 9))),
-              toOptionalTensor(std::move(peek(stack, 4, 9))),
-              (std::move(peek(stack, 5, 9))).toBool(),
-              (std::move(peek(stack, 6, 9))).toDouble(),
-              (std::move(peek(stack, 7, 9))).toDouble(),
-              (std::move(peek(stack, 8, 9))).toBool());
-          */
-          auto result = at::Tensor();
-          drop(stack, 9);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
-    Operator(
-      "dnnl::fold_weight(Tensor weight, Tensor? bn_weight, Tensor? running_var, float eps) -> Tensor",
-      [] (const Node* node) -> Operation {
-        return [] (Stack& stack) {
-          /*
-          auto result = dnnl_fold_weight(
-              (std::move(peek(stack, 0, 4))).toTensor(),
-              toOptionalTensor(std::move(peek(stack, 1, 4))),
-              toOptionalTensor(std::move(peek(stack, 2, 4))),
-              (std::move(peek(stack, 3, 4))).toDouble());
-          */
-          auto result = at::Tensor();
-          drop(stack, 4);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
-    Operator(
-      "dnnl::fold_bias(Tensor weight, Tensor? bias, Tensor? bn_weight, Tensor? bn_bias, Tensor? running_mean, Tensor? running_var, float eps) -> Tensor",
-      [] (const Node* node) -> Operation{
-        return [] (Stack& stack) {
-          /*
-          auto result = dnnl_fold_bias(
-              (std::move(peek(stack, 0, 7))).toTensor(),
-              toOptionalTensor(std::move(peek(stack, 1, 7))),
-              toOptionalTensor(std::move(peek(stack, 2, 7))),
-              toOptionalTensor(std::move(peek(stack, 3, 7))),
-              toOptionalTensor(std::move(peek(stack, 4, 7))),
-              toOptionalTensor(std::move(peek(stack, 5, 7))),
-              (std::move(peek(stack, 6, 7))).toDouble());
-          */
-          auto result = at::Tensor();
-          drop(stack, 7);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
-    Operator(
-      "dnnl::sum(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor",
-      [] (const Node* node) ->Operation {
-        return [] (Stack& stack) {
-          /*
-          auto result = dnnl_sum(
-              (std::move(peek(stack, 0, 3))).toTensor(),
-              (std::move(peek(stack, 1, 3))).toTensor(),
-              (std::move(peek(stack, 2, 3))).toScalar()
-          );
-          */
-          auto result = at::Tensor();
-          drop(stack, 3);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
-    Operator(
-      "dnnl::sum_(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)",
-      [] (const Node* node) ->Operation{
-        return [](Stack &stack) {
-          auto self = (std::move(peek(stack, 0, 3))).toTensor();
-          /*
-          auto result = dnnl_sum_(
-              self,
-              (std::move(peek(stack, 1, 3))).toTensor(),
-              (std::move(peek(stack, 2, 3))).toScalar());
-          */
-          auto result = at::Tensor();
-          drop(stack, 3);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
+      )
     /*
     Operator(
       "dnnl::conv2d_sum(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, Tensor(a!) accumu, *, Scalar alpha=1) -> Tensor(a!)",
       [] (const Node* node) ->Operation {
         return [] (Stack& stack) {
           auto output = (std::move(peek(stack, 7, 9))).toTensor();
-          auto result = dnnl_conv2d_sum(
+          auto result = AtenIpexCPUDev::conv2d_sum(
               (std::move(peek(stack, 0, 9))).toTensor(),
               (std::move(peek(stack, 1, 9))).toTensor(),
               toOptionalTensor(std::move(peek(stack, 2, 9))),
@@ -238,7 +77,7 @@ RegisterOperators op({
       [] (const Node* node) ->Operation {
         return [] (Stack& stack) {
           auto output = (std::move(peek(stack, 7, 9))).toTensor();
-          auto result = dnnl_conv2d_sum_relu(
+          auto result = AtenIpexCPUDev::conv2d_sum_relu(
               (std::move(peek(stack, 0, 9))).toTensor(),
               (std::move(peek(stack, 1, 9))).toTensor(),
               toOptionalTensor(std::move(peek(stack, 2, 9))),
@@ -257,47 +96,6 @@ RegisterOperators op({
       },
       aliasAnalysisFromSchema()
       ),*/
-    Operator(
-      "dnnl::pooling_max_2d(Tensor input, int[2] kernel_size, int[2] stride=1, int[2] padding=0, int[2] dilation=1, bool ceil_mode=0) -> Tensor(a!)",
-      [] (const Node *node) ->Operation {
-        return [] (Stack& stack) {
-          /*
-          auto result = dnnl_pooling_max_2d(
-              (std::move(peek(stack, 0, 6))).toTensor(),      // Input tensor
-              (std::move(peek(stack, 1, 6))).toIntVector(),  // Kernel size
-              (std::move(peek(stack, 2, 6))).toIntVector(),  // Stride
-              (std::move(peek(stack, 3, 6))).toIntVector(),  // Padding
-              (std::move(peek(stack, 4, 6))).toIntVector(),  // Dilation
-              (std::move(peek(stack, 5, 6))).toBool());       // Ceil mode
-          */
-          auto result = at::Tensor();
-          drop(stack, 6);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
-    Operator(
-      "dnnl::pooling_avg_2d(Tensor input, int[2] kernel_size, int[2] stride=1, int[2] padding=0, bool ceil_mode=0, bool count_include_pad=True, int? divisor_override=None) -> Tensor(a!)",
-      [] (const Node *node) ->Operation {
-        return [] (Stack& stack) {
-          /*
-          auto result = dnnl_pooling_avg_2d(
-              (std::move(peek(stack, 0, 7))).toTensor(),      // Input tensor
-              (std::move(peek(stack, 1, 7))).toIntVector(),  // Kernel size
-              (std::move(peek(stack, 2, 7))).toIntVector(),  // Stride
-              (std::move(peek(stack, 3, 7))).toIntVector(),  // Padding
-              (std::move(peek(stack, 4, 7))).toBool());       // Ceil mode
-          */
-          auto result = at::Tensor();
-          drop(stack, 7);
-          pack(stack, std::move(result));
-          return 0;
-        };
-      },
-      aliasAnalysisFromSchema()
-      ),
     });
 }
 }

From c08064abfa10dc617d15dbee4e536f4dd4cb9c98 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Wed, 20 May 2020 14:45:05 +0800
Subject: [PATCH 03/10] jit: enable conv_sum and conv_sum_relu fusion

---
 tests/cpu/test_jit.py                         | 101 ++++++++++++++----
 torch_ipex/csrc/cpu/FusionOPs.cpp             |  72 +++++++++++++
 torch_ipex/csrc/cpu/FusionOPs.h               |  15 +--
 torch_ipex/csrc/cpu/dbl/Common.cpp            |   2 -
 torch_ipex/csrc/cpu/dbl/Conv.cpp              |  61 +++++++++++
 torch_ipex/csrc/cpu/dbl/Conv.h                |  11 ++
 torch_ipex/csrc/jit/fusion_pass.cpp           |  18 ++--
 torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp |  91 ++++++++--------
 8 files changed, 290 insertions(+), 81 deletions(-)

diff --git a/tests/cpu/test_jit.py b/tests/cpu/test_jit.py
index 42142cdc8..f585da98d 100644
--- a/tests/cpu/test_jit.py
+++ b/tests/cpu/test_jit.py
@@ -56,6 +56,7 @@
 
 import torch
 import torch.nn as nn
+from torch.jit._recursive import wrap_cpp_module
 import copy
 
 import intel_pytorch_extension
@@ -82,29 +83,89 @@
 torch._C._jit_set_profiling_mode(False)
 torch._C._jit_set_profiling_executor(False)
 
-class Conv_relu(nn.Module):
-    def __init__(self):
-        super(Conv_relu, self).__init__()
+def test_output(model, x):
+    modelName = model.__class__.__name__
+    core.disable_jit()
+
+    model = model.to('dpcpp').eval()
+    x = x.to('dpcpp')
+    with torch.no_grad():
+        result = model(x)
+
+    smodel = torch.jit.script(model)
+    smodel.eval()
+    with torch.no_grad():
+        sresult = smodel(x)
+
+    print(f'\nAre {modelName} and Scripted{modelName} outputs the same: ',
+          torch.allclose(
+              sresult, result, rtol=1e-05, atol=1e-06, equal_nan=False))
+
+    core.enable_jit()
+    pmodel = torch.jit.script(model)
+    # bn folding
+    pmodel = wrap_cpp_module(torch._C._jit_pass_fold_convbn(pmodel._c))
+    with torch.no_grad():
+        # conv relu fusion, conv sum fusion or conv sum relu fusion
+        print(pmodel.graph_for(x))
+        presult = pmodel(x)
+
+    # print(result)
+    # print(sresult)
+    # print(presult)
+
+    print(f'\nWith or without pyrys, are Scripted{modelName} outputs the same: ',
+          torch.allclose(
+                sresult, presult, rtol=1e-05, atol=1e-06, equal_nan=False))
+
+class Conv2dRelu_Fixed(nn.Module):
+    def __init__(self, in_channels, out_channels, **kwargs):
+        super(Conv2dRelu_Fixed, self).__init__()
+        seed = 2018
+        torch.manual_seed(seed)
+        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
+
+    def forward(self, x):
+        return F.relu(self.conv(x), inplace=True)
+
+class CascadedConv2dBnSumRelu(nn.Module):
+    def __init__(self, in_channels, mid_channels, out_channels, **kwargs):
+        super(CascadedConv2dBnSumRelu, self).__init__()
         torch.manual_seed(2018)
-        self.conv = torch.nn.Conv2d(20, 20, 5)
+        self.conv = nn.Conv2d(in_channels, mid_channels, bias=False, **kwargs)
+        self.conv1 = nn.Conv2d(
+            mid_channels, out_channels, bias=False, padding=1, **kwargs)
+        self.conv2 = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
+        self.bn = nn.BatchNorm2d(mid_channels, eps=0.001)
+        self.bn1 = nn.BatchNorm2d(out_channels, eps=0.001)
+        self.bn2 = nn.BatchNorm2d(out_channels, eps=0.001)
 
     def forward(self, x):
-        x = self.conv(x)
-        return x.relu()
-
-class TestJITOP(TestCase):
-    def test_conv_relu_fusion(self):
-        x = torch.randn(1, 20, 20, 20).to('dpcpp')
-
-        model = Conv_relu().to('dpcpp').eval()
-
-        with torch.no_grad():
-            core.disable_jit()
-            y1 = model(x)
-            core.enable_jit()
-            script_model =  torch.jit.script(model)
-            y2 = script_model(x)
-        self.assertEqual(y1, y2)
+        a = self.conv(x)
+        a = self.bn(a)
+        a = F.relu(a, inplace=True)
+        a = self.conv1(a)
+        a = self.bn1(a)
+        b = self.conv2(x)
+        b = self.bn2(b)
+        return F.relu(a.add_(b), inplace=True)
+
+class Tester(TestCase):
+    n = 32
+    c = 3
+    h = 224
+    w = 224
+    print('input size: (%d, %d, %d, %d)' % (n, c, h, w))
+
+    def test_output_conv_relu(self):
+        test_output(
+            Conv2dRelu_Fixed(self.c, 32, kernel_size=3, stride=1),
+            torch.rand(self.n, self.c, self.h, self.w))
+
+    def test_output_cascaded_conv2d_bn_sum_relu(self):
+        test_output(
+            CascadedConv2dBnSumRelu(self.c, 64, 32, kernel_size=3, stride=1),
+            torch.rand(self.n, self.c, self.h, self.w))
 
 if __name__ == '__main__':
     core.enable_auto_dnnl()
diff --git a/torch_ipex/csrc/cpu/FusionOPs.cpp b/torch_ipex/csrc/cpu/FusionOPs.cpp
index aa06f5105..c87c0940e 100644
--- a/torch_ipex/csrc/cpu/FusionOPs.cpp
+++ b/torch_ipex/csrc/cpu/FusionOPs.cpp
@@ -55,5 +55,77 @@ at::Tensor AtenIpexJITDev::dil_convolution_relu(
   return dbl::comm::gen_aten_tensor_by(dil_output);
 }
 
+static at::Tensor& dil_convolution_inplace_fusion(
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const at::Tensor& bias,
+    at::Tensor& accumu, 
+    at::IntArrayRef stride,
+    at::IntArrayRef padding,
+    at::IntArrayRef dilation,
+    int64_t groups,
+    const dil::attr_t& attr) {
+  dil::tensor dil_input;
+  dil::tensor dil_weight;
+  dil::tensor dil_output;
+  c10::optional<dil::tensor> dil_bias{c10::nullopt};
+
+  auto input_contiguous = input.contiguous();
+  auto weight_contiguous = weight.contiguous();
+  auto output_contiguous = accumu.contiguous();
+
+  dil_input = dbl::comm::try_gen_dil_tensor(input_contiguous);
+  dil_weight = dbl::comm::try_gen_dil_tensor(weight_contiguous);
+  dil_output = dbl::comm::try_gen_dil_tensor(output_contiguous);
+  if (bias.defined()) {
+    auto bias_contiguous = bias.contiguous();
+    dil_bias = dbl::comm::try_gen_dil_tensor(bias_contiguous);
+  }
+
+  dbl::conv::conv2d_inplace_impl(
+    dil_input,
+    dil_weight,
+    dil_bias,
+    dil_output,
+    padding,
+    stride,
+    dilation,
+    groups,
+    attr);
+
+  dbl::comm::sync_shape_from_dil_to_aten(accumu, dil_output);
+  return accumu;
+}
+
+at::Tensor& AtenIpexJITDev::dil_convolution_sum(
+    const at::Tensor & input,
+    const at::Tensor & weight,
+    const at::Tensor & bias,
+    at::IntArrayRef stride,
+    at::IntArrayRef padding,
+    at::IntArrayRef dilation,
+    int64_t groups,
+    at::Tensor& accumu,
+    at::Scalar alpha) {
+  auto scale = alpha.to<float>();
+  return dil_convolution_inplace_fusion(input, weight, bias, accumu, stride, padding,
+      dilation, groups, dil::attr_t::fuse_sum(scale));
+}
+
+at::Tensor& AtenIpexJITDev::dil_convolution_sum_relu(
+    const at::Tensor & input,
+    const at::Tensor & weight,
+    const at::Tensor & bias,
+    at::IntArrayRef stride,
+    at::IntArrayRef padding,
+    at::IntArrayRef dilation,
+    int64_t groups,
+    at::Tensor& accumu,
+    at::Scalar alpha) {
+  auto scale = alpha.to<float>();
+  return dil_convolution_inplace_fusion(input, weight, bias, accumu, stride, padding,
+      dilation, groups, dil::attr_t::residual(scale));
+}
+
 }  // namespace cpu
 }  // namespace torch_ipex
diff --git a/torch_ipex/csrc/cpu/FusionOPs.h b/torch_ipex/csrc/cpu/FusionOPs.h
index 14f3db7e1..dcab1ea66 100644
--- a/torch_ipex/csrc/cpu/FusionOPs.h
+++ b/torch_ipex/csrc/cpu/FusionOPs.h
@@ -11,12 +11,11 @@ namespace torch { namespace jit {
 // XXX: PyTorch does not support nesting namespace
 // And the alias analysis is not working for namespace other than aten ...
 // So we fake some op namespaces to workaround that.
-namespace dnnl {
-  static auto conv2d_relu = Symbol::fromQualString("dnnl::conv2d_relu");
-  static auto conv2d_sum = Symbol::fromQualString("dnnl::conv2d_sum");
-  static auto conv2d_relu_sum = Symbol::fromQualString("dnnl::conv2d_relu_sum");
-  static auto conv2d_sum_relu = Symbol::fromQualString("dnnl::conv2d_sum_relu");
-
+namespace ipex {
+  static auto conv2d_relu = Symbol::fromQualString("ipex::conv2d_relu");
+  static auto conv2d_sum = Symbol::fromQualString("ipex::conv2d_sum");
+  static auto conv2d_relu_sum = Symbol::fromQualString("ipex::conv2d_relu_sum");
+  static auto conv2d_sum_relu = Symbol::fromQualString("ipex::conv2d_sum_relu");
 }
 
 }} // namespace torch::jit
@@ -29,6 +28,10 @@ class AtenIpexJITDev {
   // for JIT ops
   static at::Tensor dil_convolution_relu(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups);
 
+  static at::Tensor& dil_convolution_sum(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor& accumu, at::Scalar alpha);
+
+  static at::Tensor& dil_convolution_sum_relu( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor& accumu, at::Scalar alpha);
+
 };
 
 }  // namespace cpu
diff --git a/torch_ipex/csrc/cpu/dbl/Common.cpp b/torch_ipex/csrc/cpu/dbl/Common.cpp
index 3be05955d..913643ce3 100644
--- a/torch_ipex/csrc/cpu/dbl/Common.cpp
+++ b/torch_ipex/csrc/cpu/dbl/Common.cpp
@@ -91,7 +91,6 @@ void sync_shape_from_dil_to_aten(const at::Tensor& ipex_tensor, const dil::tenso
   dil::dims sizes = dil_tensor.get_dims();
   if (dil_tensor.is_public_format()) {
     dil::dims strides = dil_tensor.get_strides();
-    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipex_tensor.device().type() == at::DeviceType::DPCPP);
     auto* _tensor_impl = (IPEXTensorImpl *)ipex_tensor.unsafeGetTensorImpl();
     _tensor_impl->force_set_strided(sizes, strides);
   } else {
@@ -99,7 +98,6 @@ void sync_shape_from_dil_to_aten(const at::Tensor& ipex_tensor, const dil::tenso
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(sizes.size() != 1 || sizes[0] != 0);
     ipex_tensor.unsafeGetTensorImpl()->set_sizes_contiguous(sizes);
   }
-
 }
 
 }  // namespace comm
diff --git a/torch_ipex/csrc/cpu/dbl/Conv.cpp b/torch_ipex/csrc/cpu/dbl/Conv.cpp
index 9dadfdebd..c3fce71ca 100644
--- a/torch_ipex/csrc/cpu/dbl/Conv.cpp
+++ b/torch_ipex/csrc/cpu/dbl/Conv.cpp
@@ -86,6 +86,67 @@ dil::tensor conv2d_impl(
   return y;
 }
 
+void conv2d_inplace_impl(
+    const dil::tensor& x,
+    const dil::tensor& w,
+    const c10::optional<dil::tensor>& b,
+    dil::tensor& y,
+    at::IntArrayRef padding,
+    at::IntArrayRef stride,
+    at::IntArrayRef dilation,
+    int64_t groups,
+    const dil::attr_t& attr) {
+  std::vector<int64_t> kernel_size(x.ndims());
+  // mkldnn conv2d weights could have been re-ordered to 5d by
+  // mkldnn_reorder_conv2d_weight
+  if (w.ndims() == x.ndims() + 1) {
+    AT_ASSERTM(
+      groups > 1,
+      "Only group _mkldnn_conv2d weights could have been reordered to 5d");
+    kernel_size[0] = w.get_dim(0) * w.get_dim(1);
+    std::copy_n(w.get_dims().cbegin() + 2, x.ndims() - 1, kernel_size.begin() + 1);
+  } else {
+    std::copy_n(w.get_dims().cbegin(), x.ndims(), kernel_size.begin());
+  }
+
+  const dil::dims x_dims = x.get_dims();
+  std::vector<int64_t> input_size{x_dims.cbegin(), x_dims.cend()};
+  std::vector<int64_t> output_sizes = calc_conv_output_size(input_size, kernel_size, padding, stride, dilation);
+
+  if (b.has_value()) {
+    dil::convolution_forward::compute(
+      x,
+      w,
+      b.value(),
+      {output_sizes.cbegin(), output_sizes.cend()},
+      y,
+      {stride.begin(), stride.end()},
+      {dilation.begin(), dilation.end()},
+      {padding.begin(), padding.end()},
+      {padding.begin(), padding.end()},
+      groups,
+      dil::scale_t(),
+      dil::scale_t(),
+      dil::scale_t(),
+      attr);
+  } else {
+    dil::convolution_forward::compute(
+      x,
+      w,
+      {output_sizes.cbegin(), output_sizes.cend()},
+      y,
+      {stride.begin(), stride.end()},
+      {dilation.begin(), dilation.end()},
+      {padding.begin(), padding.end()},
+      {padding.begin(), padding.end()},
+      groups,
+      dil::scale_t(),
+      dil::scale_t(),
+      dil::scale_t(),
+      attr);
+  }
+}
+
 }  // namespace conv
 }  // namespace dbl
 }  // namespace cpu
diff --git a/torch_ipex/csrc/cpu/dbl/Conv.h b/torch_ipex/csrc/cpu/dbl/Conv.h
index e4d41aa33..5f954f330 100644
--- a/torch_ipex/csrc/cpu/dbl/Conv.h
+++ b/torch_ipex/csrc/cpu/dbl/Conv.h
@@ -28,6 +28,17 @@ dil::tensor conv2d_impl(
     int64_t groups,
     const dil::attr_t& attr = dil::attr_t());
 
+void conv2d_inplace_impl(
+    const dil::tensor& x,
+    const dil::tensor& w,
+    const c10::optional<dil::tensor>& b,
+    dil::tensor& y,
+    at::IntArrayRef padding,
+    at::IntArrayRef stride,
+    at::IntArrayRef dilation,
+    int64_t groups,
+    const dil::attr_t& attr = dil::attr_t());
+
 }  // namespace conv
 }  // namespace dbl
 }  // namespace cpu
diff --git a/torch_ipex/csrc/jit/fusion_pass.cpp b/torch_ipex/csrc/jit/fusion_pass.cpp
index 9568e1015..2661c7844 100644
--- a/torch_ipex/csrc/jit/fusion_pass.cpp
+++ b/torch_ipex/csrc/jit/fusion_pass.cpp
@@ -275,16 +275,14 @@ class OpFuser {
 
 // TODO: These rules should be more scalable
 OpFuser::RuleTab OpFuser::dnnlRules = {
-  {{aten::conv2d, aten::relu}, dnnl::conv2d_relu},
-  {{aten::conv2d, Symbol::fromQualString("aten::relu_")}, dnnl::conv2d_relu},
-  /*
-  {{AtenIpexCPUDev::conv2d_sum, AtenIpexCPUDev::relu}, AtenIpexCPUDev::conv2d_sum_relu},
-  {{AtenIpexCPUDev::conv2d_sum, dnnl::relu_}, AtenIpexCPUDev::conv2d_sum_relu},
-
-  {{aten::conv2d, aten::add}, AtenIpexCPUDev::conv2d_sum},
-  {{aten::conv2d, aten::add_}, AtenIpexCPUDev::conv2d_sum},
-  {{AtenIpexCPUDev::conv2d_relu, aten::add}, AtenIpexCPUDev::conv2d_relu_sum}
-  */
+  {{aten::conv2d, aten::relu}, ipex::conv2d_relu},
+  {{aten::conv2d, Symbol::fromQualString("aten::relu_")}, ipex::conv2d_relu},
+  {{ipex::conv2d_sum, aten::relu}, ipex::conv2d_sum_relu},
+  {{ipex::conv2d_sum, Symbol::fromQualString("aten::relu_")}, ipex::conv2d_sum_relu},
+
+  {{aten::conv2d, aten::add}, ipex::conv2d_sum},
+  {{aten::conv2d, aten::add_}, ipex::conv2d_sum},
+  //{{dnnl::conv2d_relu, aten::add}, dnnl::conv2d_relu_sum}
 };
 
 void FusionPass(std::shared_ptr<Graph> &graph) {
diff --git a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
index 26760b9df..bcfe69c27 100644
--- a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
+++ b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
@@ -25,7 +25,7 @@ using namespace torch_ipex::cpu;
 
 RegisterOperators op({
     Operator(
-      "dnnl::conv2d_relu(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor",
+      "ipex::conv2d_relu(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor",
       [] (const Node* node) ->Operation {
         if (torch_ipex::check_auto_dnnl()) {
           return [] (Stack& stack) {
@@ -42,60 +42,65 @@ RegisterOperators op({
             return 0;
           };
         } else {
-          TORCH_CHECK(false, "PyTorch native path not support convolution relu fusion now")
+          TORCH_CHECK(false, "PyTorch native path not support convolution relu fusion now");
         }
       },
       aliasAnalysisFromSchema()
-      )
-    /*
+      ),
     Operator(
-      "dnnl::conv2d_sum(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, Tensor(a!) accumu, *, Scalar alpha=1) -> Tensor(a!)",
+      "ipex::conv2d_sum(Tensor input, Tensor weight, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, int groups, Tensor(a!) accumu, *, Scalar alpha) -> Tensor(a!)",
       [] (const Node* node) ->Operation {
-        return [] (Stack& stack) {
-          auto output = (std::move(peek(stack, 7, 9))).toTensor();
-          auto result = AtenIpexCPUDev::conv2d_sum(
-              (std::move(peek(stack, 0, 9))).toTensor(),
-              (std::move(peek(stack, 1, 9))).toTensor(),
-              toOptionalTensor(std::move(peek(stack, 2, 9))),
-              (std::move(peek(stack, 3, 9))).toIntVector(),
-              (std::move(peek(stack, 4, 9))).toIntVector(),
-              (std::move(peek(stack, 5, 9))).toIntVector(),
-              (std::move(peek(stack, 6, 9))).toInt(),
-              output,
-              (std::move(peek(stack, 8, 9))).toScalar()
-          );
-          auto result = at::Tensor();
-          drop(stack, 9);
-          pack(stack, std::move(result));
-          return 0;
-        };
+        if (torch_ipex::check_auto_dnnl()) {
+          return [] (Stack& stack) {
+            auto output = (std::move(peek(stack, 7, 9))).toTensor();
+            auto result = AtenIpexJITDev::dil_convolution_sum(
+                (std::move(peek(stack, 0, 9))).toTensor(),
+                (std::move(peek(stack, 1, 9))).toTensor(),
+                toOptionalTensor(std::move(peek(stack, 2, 9))),
+                (std::move(peek(stack, 3, 9))).toIntVector(),
+                (std::move(peek(stack, 4, 9))).toIntVector(),
+                (std::move(peek(stack, 5, 9))).toIntVector(),
+                (std::move(peek(stack, 6, 9))).toInt(),
+                output,
+                (std::move(peek(stack, 8, 9))).toScalar()
+            );
+            drop(stack, 9);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        } else {
+          TORCH_CHECK(false, "PyTorch native path not support convolution sum fusion now");
+        }
       },
       aliasAnalysisFromSchema()
       ),
     Operator(
-      "dnnl::conv2d_sum_relu(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, Tensor(a!) accumu, *, Scalar alpha=1) -> Tensor(a!)",
+      "ipex::conv2d_sum_relu(Tensor input, Tensor weight, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, int groups, Tensor(a!) accumu, *, Scalar alpha) -> Tensor(a!)",
       [] (const Node* node) ->Operation {
-        return [] (Stack& stack) {
-          auto output = (std::move(peek(stack, 7, 9))).toTensor();
-          auto result = AtenIpexCPUDev::conv2d_sum_relu(
-              (std::move(peek(stack, 0, 9))).toTensor(),
-              (std::move(peek(stack, 1, 9))).toTensor(),
-              toOptionalTensor(std::move(peek(stack, 2, 9))),
-              (std::move(peek(stack, 3, 9))).toIntVector(),
-              (std::move(peek(stack, 4, 9))).toIntVector(),
-              (std::move(peek(stack, 5, 9))).toIntVector(),
-              (std::move(peek(stack, 6, 9))).toInt(),
-              output,
-              (std::move(peek(stack, 8, 9))).toScalar()
-          );
-          auto result = at::Tensor();
-          drop(stack, 9);
-          pack(stack, std::move(result));
-          return 0;
-        };
+        if (torch_ipex::check_auto_dnnl()) {
+          return [] (Stack& stack) {
+            auto output = (std::move(peek(stack, 7, 9))).toTensor();
+            auto result = AtenIpexJITDev::dil_convolution_sum_relu(
+                (std::move(peek(stack, 0, 9))).toTensor(),
+                (std::move(peek(stack, 1, 9))).toTensor(),
+                toOptionalTensor(std::move(peek(stack, 2, 9))),
+                (std::move(peek(stack, 3, 9))).toIntVector(),
+                (std::move(peek(stack, 4, 9))).toIntVector(),
+                (std::move(peek(stack, 5, 9))).toIntVector(),
+                (std::move(peek(stack, 6, 9))).toInt(),
+                output,
+                (std::move(peek(stack, 8, 9))).toScalar()
+            );
+            drop(stack, 9);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        } else {
+          TORCH_CHECK(false, "PyTorch native path not support convolution sum relu fusion now");
+        }
       },
       aliasAnalysisFromSchema()
-      ),*/
+      )
     });
 }
 }

From e7ee4b389edc2bb02860c08ae4967bbc3ba77427 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Mon, 25 May 2020 18:55:44 +0800
Subject: [PATCH 04/10] make the rewritten linear op traceable

---
 intel_pytorch_extension_py/ops/linear.py | 27 +-----------
 torch_ipex/csrc/cpu/CustomerOps.h        | 55 ++++++++++++++++++++++++
 torch_ipex/csrc/cpu/DevOPs.cpp           | 11 +++--
 torch_ipex/csrc/cpu/DevOPs.h             |  4 +-
 torch_ipex/csrc/cpu/ExtendOPs.cpp        |  6 ++-
 torch_ipex/csrc/cpu/ExtendOPs.h          |  2 +-
 torch_ipex/csrc/cpu/FusionOPs.cpp        |  2 +-
 torch_ipex/csrc/cpu/RegisterOps.cpp      | 11 +++++
 torch_ipex/csrc/cpu/dbl/Common.cpp       |  1 +
 torch_ipex/csrc/init_python_bindings.cpp |  2 +-
 10 files changed, 83 insertions(+), 38 deletions(-)
 create mode 100644 torch_ipex/csrc/cpu/CustomerOps.h
 create mode 100644 torch_ipex/csrc/cpu/RegisterOps.cpp

diff --git a/intel_pytorch_extension_py/ops/linear.py b/intel_pytorch_extension_py/ops/linear.py
index 05a90b23b..8ec9e76c9 100644
--- a/intel_pytorch_extension_py/ops/linear.py
+++ b/intel_pytorch_extension_py/ops/linear.py
@@ -3,29 +3,4 @@
 import torch.nn.functional as F
 import _torch_ipex as core
 
-F_linear = F.linear
-
-class LinearFunction(Function):
-    @staticmethod
-    def forward(ctx, input, weight, bias):
-        output = core.linear(input, weight, bias)
-        ctx.save_for_backward(input, weight, bias)
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        input, weight, bias = ctx.saved_tensors
-        grad_output = grad_output.contiguous()
-        if bias == None:
-            output_mask = (input.requires_grad, weight.requires_grad, 0)
-        else:
-            output_mask = (input.requires_grad, weight.requires_grad, bias.requires_grad)
-        grad_input, grad_weight, grad_bias = core.linear_backward(input, grad_output, weight, output_mask)
-        return (grad_input, grad_weight, grad_bias)
-
-def linear(input, weight, bias=None):
-    if input.device.type == 'dpcpp' and core.get_auto_dnnl():
-        return LinearFunction.apply(input, weight, bias)
-    return F_linear(input, weight, bias)
-
-F.linear = linear
+F.linear = torch.ops.torch_ipex.linear
diff --git a/torch_ipex/csrc/cpu/CustomerOps.h b/torch_ipex/csrc/cpu/CustomerOps.h
new file mode 100644
index 000000000..ea108ec4e
--- /dev/null
+++ b/torch_ipex/csrc/cpu/CustomerOps.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <torch/csrc/autograd/variable.h>
+#include <torch/csrc/autograd/custom_function.h>
+#include <torch/csrc/autograd/function.h>
+#include <ATen/Tensor.h>
+#include <torch/script.h>
+#include <c10/util/Optional.h>
+#include "torch_ipex/csrc/utils.h"
+#include "DevOPs.h"
+
+using namespace at;
+
+class NewLinearOp : public torch::autograd::Function<NewLinearOp> {
+  public:
+      static at::Tensor forward(
+        torch::autograd::AutogradContext* ctx,
+        at::Tensor input,
+        at::Tensor weight,
+        at::Tensor bias) {
+        ctx->save_for_backward({input, weight, bias});
+        if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+          return torch_ipex::cpu::AtenIpexCPUDev::dil_linear(input, weight, bias);
+        } else {
+          return at::linear(input, weight, bias);
+        }
+      }
+
+    static torch::autograd::tensor_list backward(
+        torch::autograd::AutogradContext* ctx,
+        torch::autograd::tensor_list grad_outputs) {
+      auto saved = ctx->get_saved_variables();
+      at::Tensor input = saved[0];
+      at::Tensor weight = saved[1];
+      at::Tensor bias = saved[2];
+
+      at::Tensor grad_output = grad_outputs[0];
+      at::Tensor grad_input, grad_weight;
+      at::Tensor grad_bias = torch::Tensor();
+ 
+      if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+        grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_input(
+            input.sizes(), grad_output, weight);
+        std::tie(grad_weight, grad_bias) = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_weights(
+            grad_output, input, weight, bias.defined());
+      } else {
+        auto grad_input = grad_output.mm(weight);
+        auto grad_weight = grad_output.t().mm(input);
+        if (bias.defined()) {
+          grad_bias = grad_output.sum(0);
+        }
+      }
+      return {grad_input, grad_weight, grad_bias};
+    }
+};
diff --git a/torch_ipex/csrc/cpu/DevOPs.cpp b/torch_ipex/csrc/cpu/DevOPs.cpp
index ebe231f42..9f405ed02 100644
--- a/torch_ipex/csrc/cpu/DevOPs.cpp
+++ b/torch_ipex/csrc/cpu/DevOPs.cpp
@@ -526,7 +526,7 @@ at::Tensor& AtenIpexCPUDev::dil_addbmm_(
 at::Tensor AtenIpexCPUDev::dil_linear(
     const at::Tensor& self,
     const at::Tensor& weight,
-    const c10::optional<at::Tensor>& bias) {
+    const at::Tensor& bias) {
   DEBUG("AtenIpexCPUDev::dil_linear\n");
   CHECK_DNNL_OP_PRE_COND(self);
   CHECK_DNNL_OP_PRE_COND(weight);
@@ -539,9 +539,8 @@ at::Tensor AtenIpexCPUDev::dil_linear(
   const dil::tensor w = dbl::comm::try_gen_dil_tensor(weight);
 
   dil::tensor y;
-  if (bias.has_value()) {
-    at::Tensor bias_vec = bias.value();
-    const dil::tensor b = dbl::comm::try_gen_dil_tensor(bias_vec);
+  if (bias.defined()) {
+    const dil::tensor b = dbl::comm::try_gen_dil_tensor(bias);
     dil::inner_product_forward::compute(x, w, b, y);
   } else {
     dil::inner_product_forward::compute(x, w, y);
@@ -599,7 +598,7 @@ at::Tensor AtenIpexCPUDev::dil_linear_fuse_relu(
   return dbl::comm::gen_aten_tensor_by(std::move(y));
 }
 
-at::Tensor dil_linear_backward_input(
+at::Tensor AtenIpexCPUDev::dil_linear_backward_input(
     at::IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight){
   DEBUG("AtenIpexCPUDev::dil_linear_backward_input\n");
   auto grad_output_reshaped = grad_output.dim() > 2 ?
@@ -621,7 +620,7 @@ at::Tensor dil_linear_backward_input(
   return dbl::comm::gen_aten_tensor_by(std::move(gradx));
 }
 
-std::tuple<at::Tensor, at::Tensor> dil_linear_backward_weights(
+std::tuple<at::Tensor, at::Tensor> AtenIpexCPUDev::dil_linear_backward_weights(
     const at::Tensor& grad_output, const at::Tensor& input, const at::Tensor& weight, bool bias_defined) {
   DEBUG("AtenIpexCPUDev::dil_linear_backward_weights\n");
   auto grad_output_reshaped = grad_output.dim() > 2 ?
diff --git a/torch_ipex/csrc/cpu/DevOPs.h b/torch_ipex/csrc/cpu/DevOPs.h
index 941bf93a8..49c47a199 100644
--- a/torch_ipex/csrc/cpu/DevOPs.h
+++ b/torch_ipex/csrc/cpu/DevOPs.h
@@ -38,8 +38,10 @@ class AtenIpexCPUDev {
   static at::Tensor dil_addbmm(const at::Tensor &self, const at::Tensor &batch1, const at::Tensor &batch2, at::Scalar beta, at::Scalar alpha);
   static at::Tensor& dil_addbmm_(at::Tensor& self, const at::Tensor& batch1, const at::Tensor& batch2, at::Scalar beta, at::Scalar alpha);
   static at::Tensor& dil_addbmm_out(at::Tensor& result, const at::Tensor &self, const at::Tensor &batch1, const at::Tensor &batch2, at::Scalar beta, at::Scalar alpha);
-  static at::Tensor dil_linear(const at::Tensor& self, const at::Tensor& weight, const c10::optional<at::Tensor>& bias);
   static at::Tensor dil_linear_fuse_relu(const at::Tensor& self, const at::Tensor& weight, const c10::optional<at::Tensor>& bias);
+  static at::Tensor dil_linear(const at::Tensor& self, const at::Tensor& weight, const at::Tensor& bias);
+  static at::Tensor dil_linear_backward_input(at::IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight);
+  static std::tuple<at::Tensor, at::Tensor> dil_linear_backward_weights(const at::Tensor& grad_output, const at::Tensor& input, const at::Tensor& weight, bool bias_defined);
   static std::tuple<at::Tensor, at::Tensor, at::Tensor> dil_linear_backward(const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, std::array<bool,3> output_mask);
   static at::Tensor dil_dropout(const at::Tensor& self, double ratio, bool train);
   static at::Tensor dil_dropout_backward(const at::Tensor& grady, const at::Tensor& mask, double ratio);
diff --git a/torch_ipex/csrc/cpu/ExtendOPs.cpp b/torch_ipex/csrc/cpu/ExtendOPs.cpp
index bb11a869f..80d337cc3 100644
--- a/torch_ipex/csrc/cpu/ExtendOPs.cpp
+++ b/torch_ipex/csrc/cpu/ExtendOPs.cpp
@@ -10,6 +10,7 @@
 #include "xsmm/libxsmm_utils.h"
 #include "../utils.h"
 #include "DevOPs.h"
+#include "CustomerOps.h"
 
 namespace torch_ipex {
 
@@ -449,8 +450,9 @@ AtenIpexTypeExt::embedding_bag_backward(const at::Tensor& grad, const at::Tensor
   return cpu::aten::embedding_bag::embedding_bag_backward_impl(grad, indices, offsets, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, sparse, _per_sample_weights);
 }
 
-at::Tensor AtenIpexTypeExt::linear(const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias) {
-    return cpu::AtenIpexCPUDev::dil_linear(input, weight, bias);
+
+at::Tensor AtenIpexTypeExt::linear(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias) {
+    return NewLinearOp::apply(input, weight, bias);
 }
 
 at::Tensor AtenIpexTypeExt::linear_fuse_relu(const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias) {
diff --git a/torch_ipex/csrc/cpu/ExtendOPs.h b/torch_ipex/csrc/cpu/ExtendOPs.h
index 9305e454b..dedc3e2a4 100644
--- a/torch_ipex/csrc/cpu/ExtendOPs.h
+++ b/torch_ipex/csrc/cpu/ExtendOPs.h
@@ -23,8 +23,8 @@ class AtenIpexTypeExt {
       int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse,
       const c10::optional<at::Tensor>& per_sample_weights);
 
-  static at::Tensor linear(const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias);
   static at::Tensor linear_fuse_relu(const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias);
+  static at::Tensor linear(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias = at::Tensor());
   static std::tuple<at::Tensor, at::Tensor, at::Tensor> linear_backward(const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, std::array<bool,3> output_mask);
   static at::Tensor relu_use_dst_for_bwd(const at::Tensor& grad_output, const at::Tensor& output);
   static at::Tensor adaptive_avg_pool2d(at::Tensor const& input, at::IntArrayRef output_size);
diff --git a/torch_ipex/csrc/cpu/FusionOPs.cpp b/torch_ipex/csrc/cpu/FusionOPs.cpp
index c87c0940e..2e3e1b9c5 100644
--- a/torch_ipex/csrc/cpu/FusionOPs.cpp
+++ b/torch_ipex/csrc/cpu/FusionOPs.cpp
@@ -59,7 +59,7 @@ static at::Tensor& dil_convolution_inplace_fusion(
     const at::Tensor& input,
     const at::Tensor& weight,
     const at::Tensor& bias,
-    at::Tensor& accumu, 
+    at::Tensor& accumu,
     at::IntArrayRef stride,
     at::IntArrayRef padding,
     at::IntArrayRef dilation,
diff --git a/torch_ipex/csrc/cpu/RegisterOps.cpp b/torch_ipex/csrc/cpu/RegisterOps.cpp
new file mode 100644
index 000000000..16e017c8b
--- /dev/null
+++ b/torch_ipex/csrc/cpu/RegisterOps.cpp
@@ -0,0 +1,11 @@
+#include <torch/script.h>
+#include "ExtendOPs.h"
+
+static auto registry =
+    torch::RegisterOperators()
+       .op("torch_ipex::linear",
+          [](const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias) {
+          return torch_ipex::AtenIpexTypeExt::linear(input, weight, bias);
+        });
+
+
diff --git a/torch_ipex/csrc/cpu/dbl/Common.cpp b/torch_ipex/csrc/cpu/dbl/Common.cpp
index 913643ce3..13cabe94e 100644
--- a/torch_ipex/csrc/cpu/dbl/Common.cpp
+++ b/torch_ipex/csrc/cpu/dbl/Common.cpp
@@ -91,6 +91,7 @@ void sync_shape_from_dil_to_aten(const at::Tensor& ipex_tensor, const dil::tenso
   dil::dims sizes = dil_tensor.get_dims();
   if (dil_tensor.is_public_format()) {
     dil::dims strides = dil_tensor.get_strides();
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipex_tensor.device().type() == at::DeviceType::DPCPP);
     auto* _tensor_impl = (IPEXTensorImpl *)ipex_tensor.unsafeGetTensorImpl();
     _tensor_impl->force_set_strided(sizes, strides);
   } else {
diff --git a/torch_ipex/csrc/init_python_bindings.cpp b/torch_ipex/csrc/init_python_bindings.cpp
index 33492bf09..33066cd5a 100644
--- a/torch_ipex/csrc/init_python_bindings.cpp
+++ b/torch_ipex/csrc/init_python_bindings.cpp
@@ -94,7 +94,7 @@ void InitIpexModuleBindings(py::module m) {
         });
 
   m.def("linear",
-        [](const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias) {
+        [](const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias) {
           return AtenIpexTypeExt::linear(input, weight, bias);
         });
   m.def("linear_fuse_relu",

From 85e71702bd11d72e008d25e956bb0ff520911aef Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Tue, 26 May 2020 21:22:17 +0800
Subject: [PATCH 05/10] make rewritten max_pool2d op traceable

---
 intel_pytorch_extension_py/ops/pooling.py | 18 ++++----
 torch_ipex/csrc/cpu/CustomerOps.h         | 54 +++++++++++++++++++++++
 torch_ipex/csrc/cpu/ExtendOPs.cpp         |  2 +-
 torch_ipex/csrc/cpu/RegisterOps.cpp       |  8 ++--
 4 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/intel_pytorch_extension_py/ops/pooling.py b/intel_pytorch_extension_py/ops/pooling.py
index 7ff457d56..35710b70f 100644
--- a/intel_pytorch_extension_py/ops/pooling.py
+++ b/intel_pytorch_extension_py/ops/pooling.py
@@ -2,7 +2,10 @@
 from torch.autograd import Function
 import torch.nn.functional as F
 import _torch_ipex as core
-from torch.nn.modules.utils import _single
+from torch.nn.modules.utils import _single, _pair
+from typing import List
+
+Vector = List[int]
 
 torch_adaptive_avg_pool2d = torch._C._nn.adaptive_avg_pool2d
 torch_max_pool2d = torch.max_pool2d
@@ -49,14 +52,6 @@ def adaptive_avg_pool2d(input, output_size):
         pass
     return torch_adaptive_avg_pool2d(input, output_size)
 
-def max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode):
-    try:
-        if input.device.type == 'dpcpp' and core.get_auto_dnnl():
-            return MaxPoolingFunction.apply(input, kernel_size, stride, padding, dilation, ceil_mode)
-    except RuntimeError:
-        pass
-    return torch_max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
-
 def max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode):
     try:
         if input.device.type == 'dpcpp' and core.get_auto_dnnl():
@@ -65,6 +60,9 @@ def max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode):
         pass
     return torch_max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode)
 
+def max_pool2d(input, kernel_size: Vector, stride: Vector, padding: Vector, dilation: Vector, ceil_mode: bool):
+    return torch.ops.torch_ipex.max_pool2d(input, _pair(kernel_size), _pair(stride), _pair(padding), _pair(dilation), ceil_mode)
+
 torch._C._nn.adaptive_avg_pool2d = adaptive_avg_pool2d
 torch.max_pool2d = max_pool2d
-torch.max_pool3d = max_pool3d
\ No newline at end of file
+torch.max_pool3d = max_pool3d
diff --git a/torch_ipex/csrc/cpu/CustomerOps.h b/torch_ipex/csrc/cpu/CustomerOps.h
index ea108ec4e..96a94e7db 100644
--- a/torch_ipex/csrc/cpu/CustomerOps.h
+++ b/torch_ipex/csrc/cpu/CustomerOps.h
@@ -53,3 +53,57 @@ class NewLinearOp : public torch::autograd::Function<NewLinearOp> {
       return {grad_input, grad_weight, grad_bias};
     }
 };
+
+class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> {
+  public:
+      static at::Tensor forward(
+        torch::autograd::AutogradContext* ctx,
+        at::Tensor input,
+        at::IntArrayRef kernel_size,
+        at::IntArrayRef stride,
+        at::IntArrayRef padding,
+        at::IntArrayRef dilation,
+        bool ceil_mode) {
+        ctx->saved_data["kernel_size"] = kernel_size;
+        ctx->saved_data["stride"] = stride;
+        ctx->saved_data["padding"] = padding;
+        ctx->saved_data["dilation"] = dilation;
+        ctx->saved_data["ceil_mode"] = ceil_mode;
+        if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+          at::Tensor output = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling(input, kernel_size, stride,
+              padding, dilation, ceil_mode);
+          ctx->save_for_backward({input, output});
+          return output;
+        } else {
+          at::Tensor output, indices;
+          std::tie(output, indices) = at::max_pool2d_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode);
+          ctx->save_for_backward({input, indices});
+          return output;
+        }
+      }
+
+    static torch::autograd::tensor_list backward(
+        torch::autograd::AutogradContext* ctx,
+        torch::autograd::tensor_list grad_outputs) {
+      auto saved = ctx->get_saved_variables();
+      at::Tensor input = saved[0];
+      at::Tensor indices = saved[1];
+
+      at::Tensor grad_output = grad_outputs[0];
+      at::Tensor grad_input;
+      at::IntArrayRef kernel_size = at::IntArrayRef(ctx->saved_data["kernel_size"].toIntVector());
+      at::IntArrayRef stride = at::IntArrayRef(ctx->saved_data["stride"].toIntVector());
+      at::IntArrayRef padding = at::IntArrayRef(ctx->saved_data["padding"].toIntVector());
+      at::IntArrayRef dilation = at::IntArrayRef(ctx->saved_data["dilation"].toIntVector());
+      bool ceil_mode = ctx->saved_data["ceil_mode"].toBool();
+
+      if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+        grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling_backward(
+            grad_output, indices, input, kernel_size, stride, padding, dilation, ceil_mode);
+      } else {
+        grad_input = at::max_pool2d_with_indices_backward(grad_output, input, kernel_size,
+            stride, padding, dilation, ceil_mode, indices);
+      }
+      return {grad_input};
+    }
+};
diff --git a/torch_ipex/csrc/cpu/ExtendOPs.cpp b/torch_ipex/csrc/cpu/ExtendOPs.cpp
index 80d337cc3..ed87fb9a5 100644
--- a/torch_ipex/csrc/cpu/ExtendOPs.cpp
+++ b/torch_ipex/csrc/cpu/ExtendOPs.cpp
@@ -474,7 +474,7 @@ at::Tensor AtenIpexTypeExt::adaptive_avg_pool2d_backward(const at::Tensor& grad_
 }
 
 at::Tensor AtenIpexTypeExt::max_pooling(const at::Tensor& input, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) {
-    return cpu::AtenIpexCPUDev::dil_max_pooling(input, kernel_size, stride, padding, dilation, ceil_mode);
+    return NewMaxPoolingOp::apply(input, kernel_size, stride, padding, dilation, ceil_mode);
 }
 
 at::Tensor AtenIpexTypeExt::max_pooling_backward(const at::Tensor& grad_output, const at::Tensor& output, const at::Tensor& input, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) {
diff --git a/torch_ipex/csrc/cpu/RegisterOps.cpp b/torch_ipex/csrc/cpu/RegisterOps.cpp
index 16e017c8b..f241fe341 100644
--- a/torch_ipex/csrc/cpu/RegisterOps.cpp
+++ b/torch_ipex/csrc/cpu/RegisterOps.cpp
@@ -3,9 +3,9 @@
 
 static auto registry =
     torch::RegisterOperators()
-       .op("torch_ipex::linear",
-          [](const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias) {
-          return torch_ipex::AtenIpexTypeExt::linear(input, weight, bias);
+       .op("torch_ipex::linear", &torch_ipex::AtenIpexTypeExt::linear)
+       .op("torch_ipex::max_pool2d", [](const at::Tensor& self, c10::List<int64_t> kernel_size,
+          c10::List<int64_t> stride, c10::List<int64_t> padding, c10::List<int64_t> dilation, bool ceil_mode=false){
+          return torch_ipex::AtenIpexTypeExt::max_pooling(self, kernel_size.vec(), stride.vec(), padding.vec(), dilation.vec(), ceil_mode);
         });
 
-

From 2cca38f91f01ce3af535dd7f563935c2d2d85232 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Wed, 27 May 2020 10:00:24 +0800
Subject: [PATCH 06/10] fix max_pool2d backward floating point exception issue

---
 .../csrc/cpu/{CustomerOps.h => CustomOPs.h}   | 20 +++++++++----------
 torch_ipex/csrc/cpu/ExtendOPs.cpp             |  2 +-
 torch_ipex/csrc/cpu/FusionOPs.cpp             |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)
 rename torch_ipex/csrc/cpu/{CustomerOps.h => CustomOPs.h} (86%)

diff --git a/torch_ipex/csrc/cpu/CustomerOps.h b/torch_ipex/csrc/cpu/CustomOPs.h
similarity index 86%
rename from torch_ipex/csrc/cpu/CustomerOps.h
rename to torch_ipex/csrc/cpu/CustomOPs.h
index 96a94e7db..a989d6d60 100644
--- a/torch_ipex/csrc/cpu/CustomerOps.h
+++ b/torch_ipex/csrc/cpu/CustomOPs.h
@@ -9,8 +9,6 @@
 #include "torch_ipex/csrc/utils.h"
 #include "DevOPs.h"
 
-using namespace at;
-
 class NewLinearOp : public torch::autograd::Function<NewLinearOp> {
   public:
       static at::Tensor forward(
@@ -40,9 +38,9 @@ class NewLinearOp : public torch::autograd::Function<NewLinearOp> {
  
       if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
         grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_input(
-            input.sizes(), grad_output, weight);
+            input.sizes(), grad_output.contiguous(), weight);
         std::tie(grad_weight, grad_bias) = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_weights(
-            grad_output, input, weight, bias.defined());
+            grad_output.contiguous(), input, weight, bias.defined());
       } else {
         auto grad_input = grad_output.mm(weight);
         auto grad_weight = grad_output.t().mm(input);
@@ -69,6 +67,7 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> {
         ctx->saved_data["padding"] = padding;
         ctx->saved_data["dilation"] = dilation;
         ctx->saved_data["ceil_mode"] = ceil_mode;
+
         if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
           at::Tensor output = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling(input, kernel_size, stride,
               padding, dilation, ceil_mode);
@@ -89,12 +88,13 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> {
       at::Tensor input = saved[0];
       at::Tensor indices = saved[1];
 
-      at::Tensor grad_output = grad_outputs[0];
+      at::Tensor grad_output = grad_outputs[0].contiguous();
       at::Tensor grad_input;
-      at::IntArrayRef kernel_size = at::IntArrayRef(ctx->saved_data["kernel_size"].toIntVector());
-      at::IntArrayRef stride = at::IntArrayRef(ctx->saved_data["stride"].toIntVector());
-      at::IntArrayRef padding = at::IntArrayRef(ctx->saved_data["padding"].toIntVector());
-      at::IntArrayRef dilation = at::IntArrayRef(ctx->saved_data["dilation"].toIntVector());
+
+      std::vector<int64_t> kernel_size = ctx->saved_data["kernel_size"].toIntVector();
+      std::vector<int64_t> stride = ctx->saved_data["stride"].toIntVector();
+      std::vector<int64_t> padding = ctx->saved_data["padding"].toIntVector();
+      std::vector<int64_t> dilation = ctx->saved_data["dilation"].toIntVector();
       bool ceil_mode = ctx->saved_data["ceil_mode"].toBool();
 
       if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
@@ -104,6 +104,6 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> {
         grad_input = at::max_pool2d_with_indices_backward(grad_output, input, kernel_size,
             stride, padding, dilation, ceil_mode, indices);
       }
-      return {grad_input};
+      return {grad_input, at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor()};
     }
 };
diff --git a/torch_ipex/csrc/cpu/ExtendOPs.cpp b/torch_ipex/csrc/cpu/ExtendOPs.cpp
index ed87fb9a5..fbcb0281b 100644
--- a/torch_ipex/csrc/cpu/ExtendOPs.cpp
+++ b/torch_ipex/csrc/cpu/ExtendOPs.cpp
@@ -10,7 +10,7 @@
 #include "xsmm/libxsmm_utils.h"
 #include "../utils.h"
 #include "DevOPs.h"
-#include "CustomerOps.h"
+#include "CustomOps.h"
 
 namespace torch_ipex {
 
diff --git a/torch_ipex/csrc/cpu/FusionOPs.cpp b/torch_ipex/csrc/cpu/FusionOPs.cpp
index 2e3e1b9c5..d9fec98fa 100644
--- a/torch_ipex/csrc/cpu/FusionOPs.cpp
+++ b/torch_ipex/csrc/cpu/FusionOPs.cpp
@@ -52,7 +52,7 @@ at::Tensor AtenIpexJITDev::dil_convolution_relu(
     groups,
     dil::attr_t::fuse_relu());
 
-  return dbl::comm::gen_aten_tensor_by(dil_output);
+  return dbl::comm::gen_aten_tensor_by(std::move(dil_output));
 }
 
 static at::Tensor& dil_convolution_inplace_fusion(

From 2cfb394eb5036cd8686ad1dabe2768b6eb7b70ae Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Wed, 27 May 2020 16:26:52 +0800
Subject: [PATCH 07/10] make rewritten AdaptiveAvgPool2d op traceable

---
 intel_pytorch_extension_py/ops/pooling.py | 25 ++--------------
 torch_ipex/csrc/cpu/CustomOPs.h           | 35 +++++++++++++++++++++++
 torch_ipex/csrc/cpu/ExtendOPs.cpp         |  4 +--
 torch_ipex/csrc/cpu/RegisterOps.cpp       |  3 ++
 4 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/intel_pytorch_extension_py/ops/pooling.py b/intel_pytorch_extension_py/ops/pooling.py
index 35710b70f..64ce169de 100644
--- a/intel_pytorch_extension_py/ops/pooling.py
+++ b/intel_pytorch_extension_py/ops/pooling.py
@@ -7,24 +7,8 @@
 
 Vector = List[int]
 
-torch_adaptive_avg_pool2d = torch._C._nn.adaptive_avg_pool2d
-torch_max_pool2d = torch.max_pool2d
 torch_max_pool3d = torch.max_pool3d
 
-class AdaptiveAvgPool2dFunction(Function):
-    @staticmethod
-    def forward(ctx, input, output_size):
-        output = core.adaptive_avg_pool2d(input, _single(output_size))
-        ctx.save_for_backward(input)
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        (input,) = ctx.saved_tensors
-        grad_output = grad_output.contiguous()
-        grad_input = core.adaptive_avg_pool2d_backward(grad_output, input)
-        return (grad_input, None)
-
 class MaxPoolingFunction(Function):
     @staticmethod
     def forward(ctx, input, kernel_size, stride, padding, dilation, ceil_mode):
@@ -44,13 +28,8 @@ def backward(ctx, grad_output):
         grad_input = core.max_pooling_backward(grad_output, output, input, ctx.kernel_size, ctx.stride, ctx.padding, ctx.dilation, ctx.ceil_mode)
         return (grad_input, None, None, None, None, None)
 
-def adaptive_avg_pool2d(input, output_size):
-    try:
-        if input.device.type == 'dpcpp' and core.get_auto_dnnl():
-            return AdaptiveAvgPool2dFunction.apply(input, output_size)
-    except RuntimeError:
-        pass
-    return torch_adaptive_avg_pool2d(input, output_size)
+def adaptive_avg_pool2d(input, output_size: Vector):
+    return torch.ops.torch_ipex.adaptive_avg_pool2d(input, _pair(output_size))
 
 def max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode):
     try:
diff --git a/torch_ipex/csrc/cpu/CustomOPs.h b/torch_ipex/csrc/cpu/CustomOPs.h
index a989d6d60..3d681e564 100644
--- a/torch_ipex/csrc/cpu/CustomOPs.h
+++ b/torch_ipex/csrc/cpu/CustomOPs.h
@@ -107,3 +107,38 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> {
       return {grad_input, at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor()};
     }
 };
+
+class NewApaptiveAvgPoolingOp : public torch::autograd::Function<NewApaptiveAvgPoolingOp> {
+  public:
+      static at::Tensor forward(
+        torch::autograd::AutogradContext* ctx,
+        at::Tensor input,
+        at::IntArrayRef output_size) {
+        ctx->save_for_backward({input});
+
+        at::Tensor output;
+        if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+          output = torch_ipex::cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d(input, output_size);
+        } else {
+          output = at::_adaptive_avg_pool2d(input, output_size);
+        }
+        return output;
+      }
+
+    static torch::autograd::tensor_list backward(
+        torch::autograd::AutogradContext* ctx,
+        torch::autograd::tensor_list grad_outputs) {
+      auto saved = ctx->get_saved_variables();
+      at::Tensor input = saved[0];
+
+      at::Tensor grad_output = grad_outputs[0].contiguous();
+      at::Tensor grad_input;
+
+      if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+        grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d_backward(grad_output, input);
+      } else {
+        grad_input = at::_adaptive_avg_pool2d_backward(grad_output, input);
+      }
+      return {grad_input, at::Tensor()};
+    }
+};
diff --git a/torch_ipex/csrc/cpu/ExtendOPs.cpp b/torch_ipex/csrc/cpu/ExtendOPs.cpp
index fbcb0281b..a0cacd084 100644
--- a/torch_ipex/csrc/cpu/ExtendOPs.cpp
+++ b/torch_ipex/csrc/cpu/ExtendOPs.cpp
@@ -10,7 +10,7 @@
 #include "xsmm/libxsmm_utils.h"
 #include "../utils.h"
 #include "DevOPs.h"
-#include "CustomOps.h"
+#include "CustomOPs.h"
 
 namespace torch_ipex {
 
@@ -466,7 +466,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> AtenIpexTypeExt::linear_backward(
 }
 
 at::Tensor AtenIpexTypeExt::adaptive_avg_pool2d(at::Tensor const& input, at::IntArrayRef output_size) {
-    return cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d(input, output_size);
+    return NewApaptiveAvgPoolingOp::apply(input, output_size);
 }
 
 at::Tensor AtenIpexTypeExt::adaptive_avg_pool2d_backward(const at::Tensor& grad_output, const at::Tensor& input) {
diff --git a/torch_ipex/csrc/cpu/RegisterOps.cpp b/torch_ipex/csrc/cpu/RegisterOps.cpp
index f241fe341..694d0b9de 100644
--- a/torch_ipex/csrc/cpu/RegisterOps.cpp
+++ b/torch_ipex/csrc/cpu/RegisterOps.cpp
@@ -7,5 +7,8 @@ static auto registry =
        .op("torch_ipex::max_pool2d", [](const at::Tensor& self, c10::List<int64_t> kernel_size,
           c10::List<int64_t> stride, c10::List<int64_t> padding, c10::List<int64_t> dilation, bool ceil_mode=false){
           return torch_ipex::AtenIpexTypeExt::max_pooling(self, kernel_size.vec(), stride.vec(), padding.vec(), dilation.vec(), ceil_mode);
+        })
+       .op("torch_ipex::adaptive_avg_pool2d", [](const at::Tensor&self, c10::List<int64_t> output_size) {
+          return torch_ipex::AtenIpexTypeExt::adaptive_avg_pool2d(self, output_size.vec());
         });
 

From f402c30fa6ceb00be3e6c8a138a62568dda688e7 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Thu, 28 May 2020 17:09:03 +0800
Subject: [PATCH 08/10] fix linear issue when bias is None

---
 intel_pytorch_extension_py/ops/linear.py | 8 +++++++-
 torch_ipex/csrc/cpu/CustomOPs.h          | 6 +++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/intel_pytorch_extension_py/ops/linear.py b/intel_pytorch_extension_py/ops/linear.py
index 8ec9e76c9..ab9a5480e 100644
--- a/intel_pytorch_extension_py/ops/linear.py
+++ b/intel_pytorch_extension_py/ops/linear.py
@@ -2,5 +2,11 @@
 from torch.autograd import Function
 import torch.nn.functional as F
 import _torch_ipex as core
+from typing import Optional
 
-F.linear = torch.ops.torch_ipex.linear
+def linear(input, weight, bias: Optional[torch.Tensor] = None):
+    if bias is None:  # the registered op takes a plain Tensor, so pass zeros matching weight's dtype/device
+        bias = torch.zeros(weight.size(0), dtype=weight.dtype, device=weight.device)
+    return torch.ops.torch_ipex.linear(input, weight, bias)
+
+F.linear = linear
diff --git a/torch_ipex/csrc/cpu/CustomOPs.h b/torch_ipex/csrc/cpu/CustomOPs.h
index 3d681e564..2cea2ad05 100644
--- a/torch_ipex/csrc/cpu/CustomOPs.h
+++ b/torch_ipex/csrc/cpu/CustomOPs.h
@@ -15,7 +15,7 @@ class NewLinearOp : public torch::autograd::Function<NewLinearOp> {
         torch::autograd::AutogradContext* ctx,
         at::Tensor input,
         at::Tensor weight,
-        at::Tensor bias) {
+        at::Tensor bias = at::Tensor()) {
         ctx->save_for_backward({input, weight, bias});
         if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
           return torch_ipex::cpu::AtenIpexCPUDev::dil_linear(input, weight, bias);
@@ -42,8 +42,8 @@ class NewLinearOp : public torch::autograd::Function<NewLinearOp> {
         std::tie(grad_weight, grad_bias) = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_weights(
             grad_output.contiguous(), input, weight, bias.defined());
       } else {
-        auto grad_input = grad_output.mm(weight);
-        auto grad_weight = grad_output.t().mm(input);
+        grad_input = grad_output.mm(weight);
+        grad_weight = grad_output.t().mm(input);
         if (bias.defined()) {
           grad_bias = grad_output.sum(0);
         }

From 746fe44f9188c84a13cc6950004f1d1b926fff0b Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Thu, 28 May 2020 19:01:57 +0800
Subject: [PATCH 09/10] fix max_pool2d issue with stride=None case

---
 intel_pytorch_extension_py/ops/pooling.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/intel_pytorch_extension_py/ops/pooling.py b/intel_pytorch_extension_py/ops/pooling.py
index 64ce169de..12114a91f 100644
--- a/intel_pytorch_extension_py/ops/pooling.py
+++ b/intel_pytorch_extension_py/ops/pooling.py
@@ -40,6 +40,8 @@ def max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode):
     return torch_max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode)
 
 def max_pool2d(input, kernel_size: Vector, stride: Vector, padding: Vector, dilation: Vector, ceil_mode: bool):
+    if not stride:
+        stride = kernel_size
     return torch.ops.torch_ipex.max_pool2d(input, _pair(kernel_size), _pair(stride), _pair(padding), _pair(dilation), ceil_mode)
 
 torch._C._nn.adaptive_avg_pool2d = adaptive_avg_pool2d

From 0fb1060be8819e3a875fe6433f1a56898af71f65 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaobing" <xiaobing.zhang@intel.com>
Date: Thu, 28 May 2020 19:40:51 +0800
Subject: [PATCH 10/10] add prepack_weight API

---
 torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
index bcfe69c27..2d5102b77 100644
--- a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
+++ b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp
@@ -100,6 +100,19 @@ RegisterOperators op({
         }
       },
       aliasAnalysisFromSchema()
+      ),
+    Operator(
+      "ipex::prepack_weight(Tensor input, Tensor weight, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, int groups) -> Tensor(a!)",
+      [] (const Node* node) ->Operation {
+        if (torch_ipex::check_auto_dnnl()) {
+          return [] (Stack& stack) {
+            return 0;
+          };
+        } else {
+          TORCH_CHECK(false, "PyTorch native path not support prepack weight now");
+        }
+      },
+      aliasAnalysisFromSchema()
       )
     });
 }