From 89541bcbe768220403a9671476fa199d4dc6e50d Mon Sep 17 00:00:00 2001 From: "Zhang, Xiaobing" Date: Tue, 19 May 2020 12:41:57 +0800 Subject: [PATCH 01/10] jit: enable conv_relu fusion --- cmake/CPU.cmake | 4 +- tests/cpu/test_jit.py | 111 ++++++++++++++++++ torch_ipex/csrc/auto_opt_config.h | 9 ++ torch_ipex/csrc/cpu/DevOPs.cpp | 35 ++++++ torch_ipex/csrc/cpu/DevOPs.h | 4 + torch_ipex/csrc/cpu/dbl/Conv.cpp | 15 ++- torch_ipex/csrc/cpu/dbl/Conv.h | 3 +- torch_ipex/csrc/init_python_bindings.cpp | 39 +++++- torch_ipex/csrc/jit/CMakeLists.txt | 10 ++ torch_ipex/csrc/jit/accelerated_ops.h | 2 +- torch_ipex/csrc/jit/dnnl_ops.h | 2 +- torch_ipex/csrc/jit/fusion_pass.cpp | 18 +-- torch_ipex/csrc/jit/graph_ext.cpp | 14 ++- torch_ipex/csrc/jit/graph_ext.h | 22 ++-- torch_ipex/csrc/jit/init.cpp | 21 ++-- torch_ipex/csrc/jit/op_rewrite.cpp | 17 ++- torch_ipex/csrc/jit/op_rewrite.h | 2 +- torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp | 52 +++++--- 18 files changed, 321 insertions(+), 59 deletions(-) create mode 100644 tests/cpu/test_jit.py create mode 100644 torch_ipex/csrc/jit/CMakeLists.txt diff --git a/cmake/CPU.cmake b/cmake/CPU.cmake index 72693d419..5d57ccfb9 100644 --- a/cmake/CPU.cmake +++ b/cmake/CPU.cmake @@ -136,9 +136,11 @@ include_directories(${DPCPP_THIRD_PARTY_ROOT}/xsmm/include) set(DPCPP_SRCS) set(DPCPP_COMMON_SRCS) set(DPCPP_CPU_SRCS) +set(DPCPP_JIT_SRCS) add_subdirectory(${DPCPP_ROOT}) add_subdirectory(${DPCPP_ROOT}/cpu) +add_subdirectory(${DPCPP_ROOT}/jit) # libxsmm include(${CMAKE_ROOT}/Modules/ExternalProject.cmake) @@ -153,7 +155,7 @@ ExternalProject_Add(xsmm INSTALL_COMMAND "" ) # Compile code with pybind11 -set(DPCPP_SRCS ${DPCPP_ATEN_SRCS} ${DPCPP_COMMON_SRCS} ${DPCPP_CPU_SRCS}) +set(DPCPP_SRCS ${DPCPP_ATEN_SRCS} ${DPCPP_COMMON_SRCS} ${DPCPP_CPU_SRCS} ${DPCPP_JIT_SRCS}) pybind11_add_module(${PLUGIN_NAME} SHARED ${DPCPP_SRCS}) target_link_libraries(${PLUGIN_NAME} PRIVATE ${DPCPP_THIRD_PARTY_ROOT}/xsmm/lib/libxsmm.a) diff --git a/tests/cpu/test_jit.py 
b/tests/cpu/test_jit.py new file mode 100644 index 000000000..42142cdc8 --- /dev/null +++ b/tests/cpu/test_jit.py @@ -0,0 +1,111 @@ +from __future__ import division +from __future__ import print_function + +''' +From PyTorch: + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +From Caffe2: + +Copyright (c) 2016-present, Facebook Inc. All rights reserved. + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. + +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All contributions from Caffe: +Copyright(c) 2013, 2014, 2015, the respective contributors +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. + +Caffe2 uses a copyright model similar to Caffe: each contributor holds +copyright over their contributions to Caffe2. The project versioning records +all such contribution and copyright details. If a contributor wants to further +mark their specific copyright on a particular contribution, they should +indicate their copyright solely in the commit message of the change when it is +committed. + +All rights reserved. 
+''' + +"""Tests for rn50.""" + +import math +import random +import unittest +from functools import reduce + +import torch +import torch.nn as nn +import copy + +import intel_pytorch_extension +from intel_pytorch_extension import core + +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.nn import Parameter +import torch.nn.functional as F +from torch.autograd import gradcheck +from torch.autograd.gradcheck import gradgradcheck +from torch._six import inf, nan + +from common_utils import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \ + TEST_LIBROSA, run_tests, download_file, skipIfNoLapack, suppress_warnings, \ + IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, do_test_dtypes, do_test_empty_full, \ + IS_SANDCASTLE, load_tests, brute_pdist, brute_cdist, slowTest, \ + skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf + +device = 'dpcpp:0' +#device = 'cpu:0' +SIZE = 100 + +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) + +class Conv_relu(nn.Module): + def __init__(self): + super(Conv_relu, self).__init__() + torch.manual_seed(2018) + self.conv = torch.nn.Conv2d(20, 20, 5) + + def forward(self, x): + x = self.conv(x) + return x.relu() + +class TestJITOP(TestCase): + def test_conv_relu_fusion(self): + x = torch.randn(1, 20, 20, 20).to('dpcpp') + + model = Conv_relu().to('dpcpp').eval() + + with torch.no_grad(): + core.disable_jit() + y1 = model(x) + core.enable_jit() + script_model = torch.jit.script(model) + y2 = script_model(x) + self.assertEqual(y1, y2) + +if __name__ == '__main__': + core.enable_auto_dnnl() + test = unittest.main() diff --git a/torch_ipex/csrc/auto_opt_config.h b/torch_ipex/csrc/auto_opt_config.h index 333a0adfd..2f950edef 100644 --- a/torch_ipex/csrc/auto_opt_config.h +++ b/torch_ipex/csrc/auto_opt_config.h @@ -17,6 +17,14 @@ class AutoOptConfig { return auto_dnnl_; } + inline void set_jit_fuse(bool jit_fuse) { + jit_fuse_ = jit_fuse; + } + + inline bool get_jit_fuse() { + return 
jit_fuse_; + } + inline void set_mix_bf16_fp32(bool value) { mix_bf16_fp32_ = value; } @@ -39,6 +47,7 @@ class AutoOptConfig { private: bool auto_dnnl_; + bool jit_fuse_; bool mix_bf16_fp32_; bool pure_bf16_; }; diff --git a/torch_ipex/csrc/cpu/DevOPs.cpp b/torch_ipex/csrc/cpu/DevOPs.cpp index ebe231f42..a162d7863 100644 --- a/torch_ipex/csrc/cpu/DevOPs.cpp +++ b/torch_ipex/csrc/cpu/DevOPs.cpp @@ -67,6 +67,41 @@ at::Tensor AtenIpexCPUDev::dil_convolution( return dbl::comm::gen_aten_tensor_by(std::move(dil_output)); } +at::Tensor AtenIpexCPUDev::dil_convolution_relu( + const at::Tensor & input, + const at::Tensor & weight, + const at::Tensor & bias, + at::IntArrayRef stride, + at::IntArrayRef padding, + at::IntArrayRef dilation, + int64_t groups) { + DEBUG("AtenIpexCPUDev::dil_convolution\n"); + dil::tensor dil_input; + dil::tensor dil_weight; + c10::optional dil_bias{c10::nullopt}; + + CHECK_DNNL_OP_PRE_COND(input); + CHECK_DNNL_OP_PRE_COND(weight); + dil_input = dbl::comm::try_gen_dil_tensor(input); + dil_weight = dbl::comm::try_gen_dil_tensor(weight); + if (bias.defined()) { + CHECK_DNNL_OP_PRE_COND(bias); + dil_bias = dbl::comm::try_gen_dil_tensor(bias); + } + + dil::tensor dil_output = dbl::conv::conv2d_impl( + dil_input, + dil_weight, + dil_bias, + padding, + stride, + dilation, + groups, + dil::attr_t::fuse_relu()); + + return dbl::comm::gen_aten_tensor_by(dil_output); +} + at::Tensor dil_convolution_backward_input( at::IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool bias_defined) diff --git a/torch_ipex/csrc/cpu/DevOPs.h b/torch_ipex/csrc/cpu/DevOPs.h index 7c76873e6..856bee0a7 100644 --- a/torch_ipex/csrc/cpu/DevOPs.h +++ b/torch_ipex/csrc/cpu/DevOPs.h @@ -69,6 +69,10 @@ class AtenIpexCPUDev { static at::Tensor dil_cat(at::TensorList tensors, int64_t dim); static std::vector dil_split_with_sizes(const at::Tensor& self, 
at::IntArrayRef split_sizes, int64_t dim); static std::vector dil_split(const at::Tensor& self, int64_t split_size, int64_t dim); + + // for JIT ops + static at::Tensor dil_convolution_relu(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups); + }; } // namespace cpu diff --git a/torch_ipex/csrc/cpu/dbl/Conv.cpp b/torch_ipex/csrc/cpu/dbl/Conv.cpp index b8576e669..9dadfdebd 100644 --- a/torch_ipex/csrc/cpu/dbl/Conv.cpp +++ b/torch_ipex/csrc/cpu/dbl/Conv.cpp @@ -31,7 +31,8 @@ dil::tensor conv2d_impl( at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, - int64_t groups) { + int64_t groups, + const dil::attr_t& attr) { std::vector kernel_size(x.ndims()); // mkldnn conv2d weights could have been re-ordered to 5d by // mkldnn_reorder_conv2d_weight @@ -61,7 +62,11 @@ dil::tensor conv2d_impl( {dilation.begin(), dilation.end()}, {padding.begin(), padding.end()}, {padding.begin(), padding.end()}, - groups); + groups, + dil::scale_t(), + dil::scale_t(), + dil::scale_t(), + attr); } else { dil::convolution_forward::compute( x, @@ -72,7 +77,11 @@ dil::tensor conv2d_impl( {dilation.begin(), dilation.end()}, {padding.begin(), padding.end()}, {padding.begin(), padding.end()}, - groups); + groups, + dil::scale_t(), + dil::scale_t(), + dil::scale_t(), + attr); } return y; } diff --git a/torch_ipex/csrc/cpu/dbl/Conv.h b/torch_ipex/csrc/cpu/dbl/Conv.h index 224551ca4..e4d41aa33 100644 --- a/torch_ipex/csrc/cpu/dbl/Conv.h +++ b/torch_ipex/csrc/cpu/dbl/Conv.h @@ -25,7 +25,8 @@ dil::tensor conv2d_impl( at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, - int64_t groups); + int64_t groups, + const dil::attr_t& attr = dil::attr_t()); } // namespace conv } // namespace dbl diff --git a/torch_ipex/csrc/init_python_bindings.cpp b/torch_ipex/csrc/init_python_bindings.cpp index b50eca837..4ffe64246 100644 --- 
a/torch_ipex/csrc/init_python_bindings.cpp +++ b/torch_ipex/csrc/init_python_bindings.cpp @@ -5,6 +5,13 @@ #include #include +#include +#include +#include +#include +#include "jit/fusion_pass.h" +#include "jit/op_rewrite.h" + #include #include #include @@ -128,15 +135,41 @@ void InitIpexModuleBindings(py::module m) { m.def("mlp_create_handle", &AtenIpexTypeMLPExt::create_handle); m.def("mlp_set_relu_mask", &AtenIpexTypeMLPExt::set_relu_mask); m.def("mlp_release_handle", &AtenIpexTypeMLPExt::release_handle); - m.def("is_dil_tensor", &isDilTensor); m.def("get_dil_tensor_sizes", &getDilTensorSizes); m.def("get_dil_tensor_strides", &getDilTensorStrides); + m.def("enable_jit", []() { AutoOptConfig::singleton().set_jit_fuse(true); }); + m.def("disable_jit", []() { AutoOptConfig::singleton().set_jit_fuse(false); }); + m.def("get_jit", []() { return AutoOptConfig::singleton().get_jit_fuse(); }); } } // namespace - -void InitIpexBindings(py::module m) { InitIpexModuleBindings(m); } +using namespace torch::jit; + +void InitIpexBindings(py::module m) { + InitIpexModuleBindings(m); + + // fro jit path + RegisterPass pass_1([](std::shared_ptr& g) { + if (AutoOptConfig::singleton().get_jit_fuse()) { + torch::jit::OpRewritePass(g); + } + }); + /* + RegisterPass pass_2([](std::shared_ptr& g) { + if (AutoOptConfig::singleton().get_jit_fuse()) { + std::cout<<"uisng pass2"<& g) { + if (AutoOptConfig::singleton().get_jit_fuse()) { + torch::jit::FusionPass(g); + } + }); +} } // namespace torch_ipex diff --git a/torch_ipex/csrc/jit/CMakeLists.txt b/torch_ipex/csrc/jit/CMakeLists.txt new file mode 100644 index 000000000..58f3e2729 --- /dev/null +++ b/torch_ipex/csrc/jit/CMakeLists.txt @@ -0,0 +1,10 @@ +LIST(APPEND DPCPP_JIT_SRCS + ${DPCPP_ROOT}/jit/fusion_pass.cpp + ${DPCPP_ROOT}/jit/graph_ext.cpp + ${DPCPP_ROOT}/jit/op_rewrite.cpp + ${DPCPP_ROOT}/jit/register_dnnl_jit_ops.cpp + +) + +# Pass to parent +set(DPCPP_JIT_SRCS ${DPCPP_JIT_SRCS} PARENT_SCOPE) diff --git 
a/torch_ipex/csrc/jit/accelerated_ops.h b/torch_ipex/csrc/jit/accelerated_ops.h index 3d4b6944b..9183334f9 100644 --- a/torch_ipex/csrc/jit/accelerated_ops.h +++ b/torch_ipex/csrc/jit/accelerated_ops.h @@ -1,6 +1,6 @@ #pragma once -#include +#include "cpu/dil/dil.hpp" #include namespace torch { namespace jit { diff --git a/torch_ipex/csrc/jit/dnnl_ops.h b/torch_ipex/csrc/jit/dnnl_ops.h index 547c03675..abb7de3de 100644 --- a/torch_ipex/csrc/jit/dnnl_ops.h +++ b/torch_ipex/csrc/jit/dnnl_ops.h @@ -1,6 +1,6 @@ #pragma once -#include +#include "cpu/dil/dil.hpp" #include #include diff --git a/torch_ipex/csrc/jit/fusion_pass.cpp b/torch_ipex/csrc/jit/fusion_pass.cpp index 62ca9c86c..ef528e432 100644 --- a/torch_ipex/csrc/jit/fusion_pass.cpp +++ b/torch_ipex/csrc/jit/fusion_pass.cpp @@ -88,19 +88,20 @@ class OpFuser { // currently we only have to fold conv2d + batch_norm // bool isFoldable(Node* node, Node* prev) { - bool foldable = (node->kind() == dnnl::batch_norm - && prev->kind() == dnnl::conv2d); - + bool foldable = (node->kind() == aten::batch_norm + && prev->kind() == aten::conv2d); // // Check whether all the sources are constant ??? // Does performance improve no matter we do it pre-compiling or runtime? 
// + auto* conv2d = reinterpret_cast(prev)->cast(); auto* batch_norm = reinterpret_cast(node)->cast(); foldable = foldable && conv2d->hasConstantParams() && batch_norm->hasConstantParams(); + return foldable; } @@ -125,6 +126,7 @@ class OpFuser { newNode->setScope(conv2d->scope()); // We need following parameters + newNode->addInput(conv2d->input(1)); // Conv2d weights newNode->addInput(batch_norm->input(1)); // Batch norm weights newNode->addInput(batch_norm->input(4)); // running_var (delta) @@ -134,7 +136,6 @@ class OpFuser { newNode->output()->copyMetadata(conv2d->input(1)); newNode->output()->setType(conv2d->input(1)->type()); newNode->output()->setDebugName(conv2d->input(1)->debugName() + ".bn_folded"); - return newNode; } @@ -198,7 +199,8 @@ class OpFuser { } // throw - auto er = script::ErrorReport(node->sourceRange()); + //auto er = script::ErrorReport(node->sourceRange()); + auto er = ErrorReport(node->sourceRange()); er << "Schema not found for fusion process. \n"; er << "Prev: " << *prev << "\n"; er << "Node: " << *node << "\n"; @@ -323,7 +325,7 @@ class OpFuser { } return std::make_pair(++pos->iterator(), changed); -} + } }; // TODO: These rules should be more scalable @@ -334,12 +336,14 @@ OpFuser::RuleTab OpFuser::dnnlRules = { {{dnnl::batch_norm, dnnl::relu}, dnnl::batch_norm_relu}, {{dnnl::batch_norm, dnnl::relu_}, dnnl::batch_norm_relu}, */ + /* {{dnnl::conv2d_sum, dnnl::relu}, dnnl::conv2d_sum_relu}, {{dnnl::conv2d_sum, dnnl::relu_}, dnnl::conv2d_sum_relu}, {{dnnl::conv2d, dnnl::sum}, dnnl::conv2d_sum}, {{dnnl::conv2d, dnnl::sum_}, dnnl::conv2d_sum}, - // {{dnnl::conv2d_relu, dnnl::sum}, dnnl::conv2d_relu_sum} + {{dnnl::conv2d_relu, dnnl::sum}, dnnl::conv2d_relu_sum} + */ }; void FusionPass(std::shared_ptr &graph) { diff --git a/torch_ipex/csrc/jit/graph_ext.cpp b/torch_ipex/csrc/jit/graph_ext.cpp index 46b6ef6bf..658aa4011 100644 --- a/torch_ipex/csrc/jit/graph_ext.cpp +++ b/torch_ipex/csrc/jit/graph_ext.cpp @@ -2,6 +2,7 @@ #include 
"accelerated_ops.h" namespace torch { namespace jit { +/* void NodeExt::initFormatInfo() { std::vector formatInfo ( this->inputs().size() + this->outputs().size(), @@ -9,7 +10,7 @@ void NodeExt::initFormatInfo() { this->is_(attr::format_info, std::move(formatInfo)); } - +*/ const std::vector& NodeExt::getFormatInfo() const { return this->is(attr::format_info); } @@ -40,6 +41,7 @@ void NodeExt::setGroupInfo(int64_t groups) { this->i_(attr::group_info, groups); } +/* Node *NodeExt::createReorder(Value *v, Graph *g, formatTag from, formatTag to) { NodeExt *reorder = nullptr; if (from != to) { @@ -106,7 +108,7 @@ Node* NodeExt::appendReorder(formatTag to, int i) { return reorder; } - +*/ void NodeExt::propagateFormats() { // TODO: Need consultant with acceleration libraries setOutputFormat(inputFormat()); @@ -185,7 +187,7 @@ bool Conv2dNode::hasConstantParams() const { return has; } - +/* formatTag Conv2dNode::expectedWeightFormat( c10::ArrayRef sizes, c10::List stride, @@ -207,7 +209,6 @@ formatTag Conv2dNode::expectedWeightFormat( return desc.get_internal_format(); } - void Conv2dNode::fixWeightFormatIfPossible() { if (couldInferFormats()) { auto tensor = toIValue(this->input(1))->toTensor(); @@ -222,8 +223,9 @@ void Conv2dNode::fixWeightFormatIfPossible() { this->prependReorders(use_list {{this, 1}}, {natureWeightFormat}, {groups}); } } - +*/ bool BatchNorm2dNode::hasConstantParams() const { + /* bool has = this->input(1)->node()->kind() == prim::Constant && this->input(2)->node()->kind() == prim::Constant @@ -234,6 +236,8 @@ bool BatchNorm2dNode::hasConstantParams() const { // TODO: more check to make sure return has; + */ + return true; } }} // namespace torch::jit diff --git a/torch_ipex/csrc/jit/graph_ext.h b/torch_ipex/csrc/jit/graph_ext.h index 34a141854..74f762a37 100644 --- a/torch_ipex/csrc/jit/graph_ext.h +++ b/torch_ipex/csrc/jit/graph_ext.h @@ -2,21 +2,23 @@ #include #include -#include + +#include "cpu/dil/dil.hpp" #include "accelerated_ops.h" + #include 
#include #include namespace torch { namespace jit { -using namespace ideep; -using dataType = ideep::tensor::data_type; -using formatTag = ideep::format; +using namespace dil; +using dataType = dil::tensor::data_type; +using formatTag = dil::format_tag; using formatList = std::vector; using groupsList = std::vector; -static constexpr auto natureFormat = formatTag::nchw; -static constexpr auto natureWeightFormat = formatTag::oihw; +//static constexpr auto natureFormat = formatTag::nchw; +//static constexpr auto natureWeightFormat = formatTag::oihw; // attributes for pyrys ops to decide which format is on // Or what formats transfered by reorder @@ -71,7 +73,7 @@ class NodeExt : public Node { return this->kind() == dnnl::batch_norm; } - void initFormatInfo(); + //void initFormatInfo(); template T* cast() { return reinterpret_cast(this); @@ -79,18 +81,19 @@ class NodeExt : public Node { private: // we save formats as Ints attribute internally const std::vector& getFormatInfo() const; - + /* static Node* createReorder( Value *v, Graph *g, formatTag from, formatTag to); static Node* insertReorder( Value *v, Node *insert_point, formatTag from, formatTag to); + */ }; class Conv2dNode : public NodeExt { public: bool couldInferFormats() const; bool hasConstantParams() const; - void fixWeightFormatIfPossible(); + //void fixWeightFormatIfPossible(); formatTag expectedWeightFormat( c10::ArrayRef sizes, c10::List stride, @@ -99,6 +102,7 @@ class Conv2dNode : public NodeExt { int64_t groups, dataType dtype = dataType::f32) const; }; + class BatchNorm2dNode : public NodeExt { public: bool hasConstantParams() const; diff --git a/torch_ipex/csrc/jit/init.cpp b/torch_ipex/csrc/jit/init.cpp index 487e7019e..3316d4b4f 100644 --- a/torch_ipex/csrc/jit/init.cpp +++ b/torch_ipex/csrc/jit/init.cpp @@ -7,19 +7,22 @@ #include #include "accelerated_ops.h" -#include "op_rewrite.h" -#include "format_analysis.h" +//#include "op_rewrite.h" +//#include "format_analysis.h" #include "fusion_pass.h" 
-#include "dnnl_ops.h" +//#include "dnnl_ops.h" namespace py = pybind11; using namespace torch::jit; -static bool pyrys_enabled = false; +//static bool jit_enabled = false; + +static bool jit_enabled = true; PYBIND11_MODULE(pyrys, m) { m.doc() = "A DO fusion backend for Pytorch JIT"; + /* RegisterPass pass_1([](std::shared_ptr& g) { if (pyrys_enabled) { torch::jit::OpRewritePass(g); @@ -30,14 +33,17 @@ PYBIND11_MODULE(pyrys, m) { torch::jit::FormatOptimize(g); } }); + */ RegisterPass pass_3([](std::shared_ptr& g) { - if (pyrys_enabled) { + if (jit_enabled) { + std::cout << "in init\n"; torch::jit::FusionPass(g); } }); - m.def("enable", []() { pyrys_enabled = true; }); - m.def("disable", []() { pyrys_enabled = false; }); + m.def("enable", []() { jit_enabled = true; }); + m.def("disable", []() { jit_enabled = false; }); + /* m.def("dnnl_conv2d", at::native::dnnl_conv2d, "A conv2d function of dnnl"); m.def("dnnl_conv2d_relu", at::native::dnnl_conv2d_relu, "A conv2d_relu function of dnnl"); m.def("dnnl_relu", at::native::dnnl_relu, "A relu function of dnnl"); @@ -45,4 +51,5 @@ PYBIND11_MODULE(pyrys, m) { m.def("dnnl_batch_norm", at::native::dnnl_batch_norm, "A batch_norm function of dnnl"); m.def("dnnl_pooling_max_2d", at::native::dnnl_pooling_max_2d, "A max-pooling-2d funtion of dnnl"); m.def("dnnl_pooling_avg_2d", at::native::dnnl_pooling_avg_2d, "An avg-pooling-2d funtion of dnnl"); + */ } diff --git a/torch_ipex/csrc/jit/op_rewrite.cpp b/torch_ipex/csrc/jit/op_rewrite.cpp index 481cba542..8a5a65efe 100644 --- a/torch_ipex/csrc/jit/op_rewrite.cpp +++ b/torch_ipex/csrc/jit/op_rewrite.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include "graph_ext.h" #include "op_rewrite.h" @@ -24,7 +24,7 @@ NodeExt* replaceOpWithDNNL(Node *node, Graph *g) { auto* replacement = reinterpret_cast( replaceOpWithNewKind(node, g, rules.at(node->kind()))); - replacement->initFormatInfo(); + //replacement->initFormatInfo(); return replacement; } @@ -47,10 +47,11 @@ void 
OpRewritePass(Block *block) { // need a reorder to transform it back // auto newNode = replaceOpWithDNNL(node, block->owningGraph()); + /* auto conv2d = newNode->cast(); conv2d->fixWeightFormatIfPossible(); conv2d->appendReorder(natureFormat); - + */ // If we could get more information about the weights // We could prepend a reorder for the weights and constant propagation // might help us create a MKL-DNN friendly weight @@ -62,20 +63,24 @@ void OpRewritePass(Block *block) { // auto lh_node = node->input(0)->node(); auto rh_node = node->input(1)->node(); + /* auto by_pass_reorder = [](const Node *n) { return (n->kind() == dnnl::reorder) ? n->input()->node() : n; }; - + */ // // higher priority for conv+sum fusion than other kind // possibly we check whether there is a chance for conv+sum+relu // + /* if (by_pass_reorder(lh_node)->kind() == dnnl::conv2d || by_pass_reorder(rh_node)->kind() == dnnl::conv2d || by_pass_reorder(lh_node)->kind() == dnnl::batch_norm || by_pass_reorder(rh_node)->kind() == dnnl::batch_norm) replaceOpWithDNNL(node, block->owningGraph()); + */ + replaceOpWithDNNL(node, block->owningGraph()); } else if (node->matches("aten::relu(Tensor self) -> Tensor") || node->matches("aten::relu_(Tensor(a!) self) -> Tensor(a!)") || node->matches( @@ -102,10 +107,10 @@ void OpRewritePass(Block *block) { } auto newNode = replaceOpWithDNNL(node, block->owningGraph()); - newNode->appendReorder(natureFormat); + //newNode->appendReorder(natureFormat); } else if (node->matches("aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride=[], int[] padding, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor")) { auto newNode = replaceOpWithDNNL(node, block->owningGraph()); - newNode->appendReorder(natureFormat); + //newNode->appendReorder(natureFormat); } } } diff --git a/torch_ipex/csrc/jit/op_rewrite.h b/torch_ipex/csrc/jit/op_rewrite.h index cac609ba7..2c1ab26a8 100644 --- a/torch_ipex/csrc/jit/op_rewrite.h +++ b/torch_ipex/csrc/jit/op_rewrite.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include "cpu/dil/dil.hpp" #include namespace torch { namespace jit { diff --git a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp index 487e2711d..9417ac716 100644 --- a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp +++ b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp @@ -2,15 +2,15 @@ #include "torch/csrc/jit/runtime/custom_operator.h" #include "accelerated_ops.h" #include "graph_ext.h" -#include "dnnl_ops.h" +#include "cpu/DevOPs.h" +//#include "dnnl_ops.h" + namespace torch { namespace jit { -c10::OperatorOptions aliasAnalysisFromSchema() { - c10::OperatorOptions result; - result.setAliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA); - return result; +c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; } at::Tensor toOptionalTensor(const IValue& v) { @@ -20,7 +20,7 @@ at::Tensor toOptionalTensor(const IValue& v) { return v.toTensor(); } -using namespace at::native; +using namespace torch_ipex::cpu; RegisterOperators op({ Operator( @@ -32,8 +32,9 @@ RegisterOperators op({ auto to = enode->inputFormat(1); auto groups = enode->getGroupInfo(); - auto result = dnnl_reorder( - (std::move(peek(stack, 0, 1))).toTensor(), from, to, groups); + // auto result = dnnl_reorder( + // (std::move(peek(stack, 0, 1))).toTensor(), from, to, groups); + auto result = at::Tensor(); drop(stack, 1); pack(stack, std::move(result)); return 0; @@ -45,8 +46,7 @@ RegisterOperators op({ "dnnl::relu(Tensor self) -> Tensor", [](const Node* node) -> Operation { return [] (Stack& 
stack) { - auto result = dnnl_relu( - (std::move(peek(stack, 0, 1))).toTensor()); + auto result = AtenIpexCPUDev::dil_relu((std::move(peek(stack, 0, 1))).toTensor()); drop(stack, 1); pack(stack, std::move(result)); return 0; @@ -60,7 +60,7 @@ RegisterOperators op({ return [] (Stack& stack) { at::Tensor input; pop(stack, input); - auto result = dnnl_relu_(input); + auto result = AtenIpexCPUDev::dil_relu_(input); push(stack, std::move(result)); return 0; }; @@ -71,7 +71,7 @@ RegisterOperators op({ "dnnl::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor", [] (const Node* node) -> Operation { return [] (Stack& stack) { - auto result = dnnl_conv2d( + auto result = AtenIpexCPUDev::dil_convolution( (std::move(peek(stack, 0, 7))).toTensor(), (std::move(peek(stack, 1, 7))).toTensor(), toOptionalTensor(std::move(peek(stack, 2, 7))), @@ -90,7 +90,7 @@ RegisterOperators op({ "dnnl::conv2d_relu(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor", [] (const Node* node) ->Operation { return [] (Stack& stack) { - auto result = dnnl_conv2d_relu( + auto result = AtenIpexCPUDev::dil_convolution_relu( (std::move(peek(stack, 0, 7))).toTensor(), (std::move(peek(stack, 1, 7))).toTensor(), toOptionalTensor(std::move(peek(stack, 2, 7))), @@ -109,6 +109,7 @@ RegisterOperators op({ "dnnl::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? 
running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor", [] (const Node* node) ->Operation { return [] (Stack& stack) { + /* auto result = dnnl_batch_norm( (std::move(peek(stack, 0, 9))).toTensor(), toOptionalTensor(std::move(peek(stack, 1, 9))), @@ -119,6 +120,8 @@ RegisterOperators op({ (std::move(peek(stack, 6, 9))).toDouble(), (std::move(peek(stack, 7, 9))).toDouble(), (std::move(peek(stack, 8, 9))).toBool()); + */ + auto result = at::Tensor(); drop(stack, 9); pack(stack, std::move(result)); return 0; @@ -130,11 +133,14 @@ RegisterOperators op({ "dnnl::fold_weight(Tensor weight, Tensor? bn_weight, Tensor? running_var, float eps) -> Tensor", [] (const Node* node) -> Operation { return [] (Stack& stack) { + /* auto result = dnnl_fold_weight( (std::move(peek(stack, 0, 4))).toTensor(), toOptionalTensor(std::move(peek(stack, 1, 4))), toOptionalTensor(std::move(peek(stack, 2, 4))), (std::move(peek(stack, 3, 4))).toDouble()); + */ + auto result = at::Tensor(); drop(stack, 4); pack(stack, std::move(result)); return 0; @@ -146,6 +152,7 @@ RegisterOperators op({ "dnnl::fold_bias(Tensor weight, Tensor? bias, Tensor? bn_weight, Tensor? bn_bias, Tensor? running_mean, Tensor? 
running_var, float eps) -> Tensor", [] (const Node* node) -> Operation{ return [] (Stack& stack) { + /* auto result = dnnl_fold_bias( (std::move(peek(stack, 0, 7))).toTensor(), toOptionalTensor(std::move(peek(stack, 1, 7))), @@ -154,6 +161,8 @@ RegisterOperators op({ toOptionalTensor(std::move(peek(stack, 4, 7))), toOptionalTensor(std::move(peek(stack, 5, 7))), (std::move(peek(stack, 6, 7))).toDouble()); + */ + auto result = at::Tensor(); drop(stack, 7); pack(stack, std::move(result)); return 0; @@ -165,11 +174,14 @@ RegisterOperators op({ "dnnl::sum(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", [] (const Node* node) ->Operation { return [] (Stack& stack) { + /* auto result = dnnl_sum( (std::move(peek(stack, 0, 3))).toTensor(), (std::move(peek(stack, 1, 3))).toTensor(), (std::move(peek(stack, 2, 3))).toScalar() ); + */ + auto result = at::Tensor(); drop(stack, 3); pack(stack, std::move(result)); return 0; @@ -182,10 +194,13 @@ RegisterOperators op({ [] (const Node* node) ->Operation{ return [](Stack &stack) { auto self = (std::move(peek(stack, 0, 3))).toTensor(); + /* auto result = dnnl_sum_( self, (std::move(peek(stack, 1, 3))).toTensor(), (std::move(peek(stack, 2, 3))).toScalar()); + */ + auto result = at::Tensor(); drop(stack, 3); pack(stack, std::move(result)); return 0; @@ -193,6 +208,7 @@ RegisterOperators op({ }, aliasAnalysisFromSchema() ), + /* Operator( "dnnl::conv2d_sum(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, Tensor(a!) 
accumu, *, Scalar alpha=1) -> Tensor(a!)", [] (const Node* node) ->Operation { @@ -209,6 +225,7 @@ RegisterOperators op({ output, (std::move(peek(stack, 8, 9))).toScalar() ); + auto result = at::Tensor(); drop(stack, 9); pack(stack, std::move(result)); return 0; @@ -232,17 +249,19 @@ RegisterOperators op({ output, (std::move(peek(stack, 8, 9))).toScalar() ); + auto result = at::Tensor(); drop(stack, 9); pack(stack, std::move(result)); return 0; }; }, aliasAnalysisFromSchema() - ), + ),*/ Operator( "dnnl::pooling_max_2d(Tensor input, int[2] kernel_size, int[2] stride=1, int[2] padding=0, int[2] dilation=1, bool ceil_mode=0) -> Tensor(a!)", [] (const Node *node) ->Operation { return [] (Stack& stack) { + /* auto result = dnnl_pooling_max_2d( (std::move(peek(stack, 0, 6))).toTensor(), // Input tensor (std::move(peek(stack, 1, 6))).toIntVector(), // Kernel size @@ -250,6 +269,8 @@ RegisterOperators op({ (std::move(peek(stack, 3, 6))).toIntVector(), // Padding (std::move(peek(stack, 4, 6))).toIntVector(), // Dilation (std::move(peek(stack, 5, 6))).toBool()); // Ceil mode + */ + auto result = at::Tensor(); drop(stack, 6); pack(stack, std::move(result)); return 0; @@ -261,12 +282,15 @@ RegisterOperators op({ "dnnl::pooling_avg_2d(Tensor input, int[2] kernel_size, int[2] stride=1, int[2] padding=0, bool ceil_mode=0, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor(a!)", [] (const Node *node) ->Operation { return [] (Stack& stack) { + /* auto result = dnnl_pooling_avg_2d( (std::move(peek(stack, 0, 7))).toTensor(), // Input tensor (std::move(peek(stack, 1, 7))).toIntVector(), // Kernel size (std::move(peek(stack, 2, 7))).toIntVector(), // Stride (std::move(peek(stack, 3, 7))).toIntVector(), // Padding (std::move(peek(stack, 4, 7))).toBool()); // Ceil mode + */ + auto result = at::Tensor(); drop(stack, 7); pack(stack, std::move(result)); return 0; From bbd70f8e4b8e107bdbf95c5b0fdeeeab56c1ac98 Mon Sep 17 00:00:00 2001 From: "Zhang, Xiaobing" Date: Tue, 19 May 2020 18:30:03 +0800 Subject: [PATCH 02/10] only jit fusion for extension path --- torch_ipex/csrc/cpu/DevOPs.cpp | 35 --- torch_ipex/csrc/cpu/DevOPs.h | 3 - torch_ipex/csrc/cpu/FusionOPs.cpp | 59 ++++ torch_ipex/csrc/cpu/FusionOPs.h | 35 +++ torch_ipex/csrc/init_python_bindings.cpp | 18 +- torch_ipex/csrc/jit/CMakeLists.txt | 2 - torch_ipex/csrc/jit/accelerated_ops.h | 2 +- torch_ipex/csrc/jit/dnnl_ops.h | 2 +- torch_ipex/csrc/jit/fusion_pass.cpp | 152 ++++------- torch_ipex/csrc/jit/graph_ext.cpp | 13 +- torch_ipex/csrc/jit/graph_ext.h | 22 +- torch_ipex/csrc/jit/init.cpp | 21 +- torch_ipex/csrc/jit/op_rewrite.cpp | 17 +- torch_ipex/csrc/jit/op_rewrite.h | 2 +- torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp | 256 ++---------------- 15 files changed, 198 insertions(+), 441 deletions(-) create mode 100644 torch_ipex/csrc/cpu/FusionOPs.cpp create mode 100644 torch_ipex/csrc/cpu/FusionOPs.h diff --git a/torch_ipex/csrc/cpu/DevOPs.cpp b/torch_ipex/csrc/cpu/DevOPs.cpp index a162d7863..ebe231f42 100644 --- a/torch_ipex/csrc/cpu/DevOPs.cpp +++ b/torch_ipex/csrc/cpu/DevOPs.cpp @@ -67,41 +67,6 @@ at::Tensor AtenIpexCPUDev::dil_convolution( return dbl::comm::gen_aten_tensor_by(std::move(dil_output)); } -at::Tensor AtenIpexCPUDev::dil_convolution_relu( - const at::Tensor & input, - const at::Tensor & weight, - const at::Tensor & bias, - at::IntArrayRef 
stride, - at::IntArrayRef padding, - at::IntArrayRef dilation, - int64_t groups) { - DEBUG("AtenIpexCPUDev::dil_convolution\n"); - dil::tensor dil_input; - dil::tensor dil_weight; - c10::optional dil_bias{c10::nullopt}; - - CHECK_DNNL_OP_PRE_COND(input); - CHECK_DNNL_OP_PRE_COND(weight); - dil_input = dbl::comm::try_gen_dil_tensor(input); - dil_weight = dbl::comm::try_gen_dil_tensor(weight); - if (bias.defined()) { - CHECK_DNNL_OP_PRE_COND(bias); - dil_bias = dbl::comm::try_gen_dil_tensor(bias); - } - - dil::tensor dil_output = dbl::conv::conv2d_impl( - dil_input, - dil_weight, - dil_bias, - padding, - stride, - dilation, - groups, - dil::attr_t::fuse_relu()); - - return dbl::comm::gen_aten_tensor_by(dil_output); -} - at::Tensor dil_convolution_backward_input( at::IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool bias_defined) diff --git a/torch_ipex/csrc/cpu/DevOPs.h b/torch_ipex/csrc/cpu/DevOPs.h index 856bee0a7..941bf93a8 100644 --- a/torch_ipex/csrc/cpu/DevOPs.h +++ b/torch_ipex/csrc/cpu/DevOPs.h @@ -70,9 +70,6 @@ class AtenIpexCPUDev { static std::vector dil_split_with_sizes(const at::Tensor& self, at::IntArrayRef split_sizes, int64_t dim); static std::vector dil_split(const at::Tensor& self, int64_t split_size, int64_t dim); - // for JIT ops - static at::Tensor dil_convolution_relu(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups); - }; } // namespace cpu diff --git a/torch_ipex/csrc/cpu/FusionOPs.cpp b/torch_ipex/csrc/cpu/FusionOPs.cpp new file mode 100644 index 000000000..aa06f5105 --- /dev/null +++ b/torch_ipex/csrc/cpu/FusionOPs.cpp @@ -0,0 +1,59 @@ +#include "torch_ipex/csrc/cpu/FusionOPs.h" + +#include +#include +#include +#include +#include + +#include + +#include "torch_ipex/csrc/aten_ipex_bridge.h" +#include 
"torch_ipex/csrc/ipex_tensor_impl.h" +#include "torch_ipex/csrc/utils.h" +#include "dbl/Common.h" +#include "dbl/Conv.h" +#include "ShadeDataContext.h" + +#include "dil/dil.hpp" + +namespace torch_ipex { +namespace cpu { + +at::Tensor AtenIpexJITDev::dil_convolution_relu( + const at::Tensor & input, + const at::Tensor & weight, + const at::Tensor & bias, + at::IntArrayRef stride, + at::IntArrayRef padding, + at::IntArrayRef dilation, + int64_t groups) { + dil::tensor dil_input; + dil::tensor dil_weight; + c10::optional dil_bias{c10::nullopt}; + + auto input_contiguous = input.contiguous(); + auto weight_contiguous = weight.contiguous(); + + dil_input = dbl::comm::try_gen_dil_tensor(input_contiguous); + dil_weight = dbl::comm::try_gen_dil_tensor(weight_contiguous); + if (bias.defined()) { + auto bias_contiguous = bias.contiguous(); + dil_bias = dbl::comm::try_gen_dil_tensor(bias_contiguous); + } + + dil::tensor dil_output = dbl::conv::conv2d_impl( + dil_input, + dil_weight, + dil_bias, + padding, + stride, + dilation, + groups, + dil::attr_t::fuse_relu()); + + return dbl::comm::gen_aten_tensor_by(dil_output); +} + +} // namespace cpu +} // namespace torch_ipex diff --git a/torch_ipex/csrc/cpu/FusionOPs.h b/torch_ipex/csrc/cpu/FusionOPs.h new file mode 100644 index 000000000..14f3db7e1 --- /dev/null +++ b/torch_ipex/csrc/cpu/FusionOPs.h @@ -0,0 +1,35 @@ +#pragma once + +#include + +#include + +#include "dil/dil.hpp" + +namespace torch { namespace jit { + +// XXX: PyTorch does not support nesting namespace +// And the alias analysis is not working for namespace other than aten ... +// So we fake some op namespaces to workaround that. 
+namespace dnnl { + static auto conv2d_relu = Symbol::fromQualString("dnnl::conv2d_relu"); + static auto conv2d_sum = Symbol::fromQualString("dnnl::conv2d_sum"); + static auto conv2d_relu_sum = Symbol::fromQualString("dnnl::conv2d_relu_sum"); + static auto conv2d_sum_relu = Symbol::fromQualString("dnnl::conv2d_sum_relu"); + +} + +}} // namespace torch::jit + +namespace torch_ipex { +namespace cpu { + +class AtenIpexJITDev { + public: + // for JIT ops + static at::Tensor dil_convolution_relu(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups); + +}; + +} // namespace cpu +} // namespace torch_ipex diff --git a/torch_ipex/csrc/init_python_bindings.cpp b/torch_ipex/csrc/init_python_bindings.cpp index 4ffe64246..33492bf09 100644 --- a/torch_ipex/csrc/init_python_bindings.cpp +++ b/torch_ipex/csrc/init_python_bindings.cpp @@ -10,7 +10,6 @@ #include #include #include "jit/fusion_pass.h" -#include "jit/op_rewrite.h" #include #include @@ -148,23 +147,8 @@ using namespace torch::jit; void InitIpexBindings(py::module m) { InitIpexModuleBindings(m); - - // fro jit path - RegisterPass pass_1([](std::shared_ptr& g) { - if (AutoOptConfig::singleton().get_jit_fuse()) { - torch::jit::OpRewritePass(g); - } - }); - /* - RegisterPass pass_2([](std::shared_ptr& g) { - if (AutoOptConfig::singleton().get_jit_fuse()) { - std::cout<<"uisng pass2"<& g) { + RegisterPass pass([](std::shared_ptr& g) { if (AutoOptConfig::singleton().get_jit_fuse()) { torch::jit::FusionPass(g); } diff --git a/torch_ipex/csrc/jit/CMakeLists.txt b/torch_ipex/csrc/jit/CMakeLists.txt index 58f3e2729..3f313b336 100644 --- a/torch_ipex/csrc/jit/CMakeLists.txt +++ b/torch_ipex/csrc/jit/CMakeLists.txt @@ -1,7 +1,5 @@ LIST(APPEND DPCPP_JIT_SRCS ${DPCPP_ROOT}/jit/fusion_pass.cpp - ${DPCPP_ROOT}/jit/graph_ext.cpp - ${DPCPP_ROOT}/jit/op_rewrite.cpp ${DPCPP_ROOT}/jit/register_dnnl_jit_ops.cpp ) diff --git 
a/torch_ipex/csrc/jit/accelerated_ops.h b/torch_ipex/csrc/jit/accelerated_ops.h index 9183334f9..3d4b6944b 100644 --- a/torch_ipex/csrc/jit/accelerated_ops.h +++ b/torch_ipex/csrc/jit/accelerated_ops.h @@ -1,6 +1,6 @@ #pragma once -#include "cpu/dil/dil.hpp" +#include #include namespace torch { namespace jit { diff --git a/torch_ipex/csrc/jit/dnnl_ops.h b/torch_ipex/csrc/jit/dnnl_ops.h index abb7de3de..547c03675 100644 --- a/torch_ipex/csrc/jit/dnnl_ops.h +++ b/torch_ipex/csrc/jit/dnnl_ops.h @@ -1,6 +1,6 @@ #pragma once -#include "cpu/dil/dil.hpp" +#include #include #include diff --git a/torch_ipex/csrc/jit/fusion_pass.cpp b/torch_ipex/csrc/jit/fusion_pass.cpp index ef528e432..9568e1015 100644 --- a/torch_ipex/csrc/jit/fusion_pass.cpp +++ b/torch_ipex/csrc/jit/fusion_pass.cpp @@ -1,7 +1,8 @@ #include -#include "graph_ext.h" #include "fusion_pass.h" -#include "accelerated_ops.h" + +#include "cpu/FusionOPs.h" + #include #include #include @@ -80,85 +81,38 @@ class OpFuser { aliasDb_ = std::make_unique(graph_); } - Node* fuseNodes(Node *curr, Value *path, Rule rule) { - return fuseOpsWithNewKind(curr, path, curr->owningGraph(), rule->second); - } - - // - // currently we only have to fold conv2d + batch_norm - // - bool isFoldable(Node* node, Node* prev) { - bool foldable = (node->kind() == aten::batch_norm - && prev->kind() == aten::conv2d); - // - // Check whether all the sources are constant ??? - // Does performance improve no matter we do it pre-compiling or runtime? 
- // - - auto* conv2d = reinterpret_cast(prev)->cast(); - auto* batch_norm = reinterpret_cast(node)->cast(); - - foldable = foldable - && conv2d->hasConstantParams() - && batch_norm->hasConstantParams(); - - return foldable; - } - - Node* foldNodes(Node *conv2d, Node *batch_norm) { - // Change weight/bias source - auto* fold_weight = createBatchNormFoldWeight(conv2d, batch_norm); - fold_weight->insertBefore(conv2d); - conv2d->replaceInput(1, fold_weight->output()); + Node* fuseOpsWithNewKind(Node *curr, Value *v, Graph *g, NodeKind kind) { + auto newNode = g->create(kind); + auto prev = v->node(); + newNode->insertBefore(prev); + newNode->setScope(prev->scope()); + newNode->copyAttributes(*prev); - auto* fold_bias = createBatchNormFoldBias(conv2d, batch_norm); - fold_bias->insertBefore(conv2d); - conv2d->replaceInput(2, fold_bias->output()); + for (auto input : prev->inputs()) { + newNode->addInput(input); + } - batch_norm->replaceAllUsesWith(conv2d); - batch_norm->destroy(); - return conv2d; - } + for (auto input : curr->inputs()) { + if (input != v) { + newNode->addInput(input); + } + } - Node* createBatchNormFoldWeight(Node *conv2d, Node *batch_norm) { - auto g = conv2d->owningGraph(); - auto newNode = g->create(dnnl::fold_weight); - newNode->setScope(conv2d->scope()); + // Copy curr or prev? 
+ newNode->output()->copyMetadata(prev->output()); + newNode->output()->setType(prev->output()->type()); - // We need following parameters + v->replaceAllUsesWith(newNode->output()); + curr->replaceAllUsesWith(newNode); - newNode->addInput(conv2d->input(1)); // Conv2d weights - newNode->addInput(batch_norm->input(1)); // Batch norm weights - newNode->addInput(batch_norm->input(4)); // running_var (delta) - newNode->addInput(batch_norm->input(7)); // eps + prev->destroy(); + curr->destroy(); - // We get meta and type from conv2d weight value - newNode->output()->copyMetadata(conv2d->input(1)); - newNode->output()->setType(conv2d->input(1)->type()); - newNode->output()->setDebugName(conv2d->input(1)->debugName() + ".bn_folded"); return newNode; } - Node* createBatchNormFoldBias(Node *conv2d, Node *batch_norm) { - auto g = conv2d->owningGraph(); - auto newNode = g->create(dnnl::fold_bias); - newNode->setScope(conv2d->scope()); - - // We need following information - newNode->addInput(conv2d->input(1)); // Conv weight - newNode->addInput(conv2d->input(2)); // Conv bias - newNode->addInput(batch_norm->input(1)); // batch norm weight - newNode->addInput(batch_norm->input(2)); // batch norm bias - newNode->addInput(batch_norm->input(3)); // running_mean (mu) - newNode->addInput(batch_norm->input(4)); // running_var (delta) - newNode->addInput(batch_norm->input(7)); // eps - - // We get meta and type from conv2d bias value - newNode->output()->copyMetadata(conv2d->input(2)); - newNode->output()->setType(conv2d->input(2)->type()); - newNode->output()->setDebugName(conv2d->input(2)->debugName() + ".bn_folded"); - - return newNode; + Node* fuseNodes(Node *curr, Value *path, Rule rule) { + return fuseOpsWithNewKind(curr, path, curr->owningGraph(), rule->second); } bool aliasIsSafeForSquashingValue(Node *node, Value *v) { @@ -199,7 +153,6 @@ class OpFuser { } // throw - //auto er = script::ErrorReport(node->sourceRange()); auto er = ErrorReport(node->sourceRange()); er << 
"Schema not found for fusion process. \n"; er << "Prev: " << *prev << "\n"; @@ -297,52 +250,40 @@ class OpFuser { } std::pair processNode(Node *node) { - auto nodeExt = reinterpret_cast(node); Node* pos = node; bool changed = false; - if (nodeExt->isDNNLOps()) { - // - // Check whether we could fuse to one certain value path - // - for (auto *v : node->inputs()) { - auto prev = v->node(); - auto fuseRule = isFusable(node, prev); - - // We can fuse only one path - if (fuseRule && aliasIsSafeForFusion(node, v, fuseRule)) { - pos = fuseNodes(node, v, fuseRule.value()); - changed = true; - break; - } else if (isFoldable(node, prev) - && aliasIsSafeForSquashingValue(node, v)) { - pos = foldNodes(prev, node); - changed = true; - break; - } + // + // Check whether we could fuse to one certain value path + // + for (auto *v : node->inputs()) { + auto prev = v->node(); + auto fuseRule = isFusable(node, prev); + + // We can fuse only one path + if (fuseRule && aliasIsSafeForFusion(node, v, fuseRule)) { + pos = fuseNodes(node, v, fuseRule.value()); + changed = true; + break; } } - return std::make_pair(++pos->iterator(), changed); } + }; // TODO: These rules should be more scalable OpFuser::RuleTab OpFuser::dnnlRules = { - {{dnnl::conv2d, dnnl::relu}, dnnl::conv2d_relu}, - {{dnnl::conv2d, dnnl::relu_}, dnnl::conv2d_relu}, - /* - {{dnnl::batch_norm, dnnl::relu}, dnnl::batch_norm_relu}, - {{dnnl::batch_norm, dnnl::relu_}, dnnl::batch_norm_relu}, - */ + {{aten::conv2d, aten::relu}, dnnl::conv2d_relu}, + {{aten::conv2d, Symbol::fromQualString("aten::relu_")}, dnnl::conv2d_relu}, /* - {{dnnl::conv2d_sum, dnnl::relu}, dnnl::conv2d_sum_relu}, - {{dnnl::conv2d_sum, dnnl::relu_}, dnnl::conv2d_sum_relu}, + {{AtenIpexCPUDev::conv2d_sum, AtenIpexCPUDev::relu}, AtenIpexCPUDev::conv2d_sum_relu}, + {{AtenIpexCPUDev::conv2d_sum, dnnl::relu_}, AtenIpexCPUDev::conv2d_sum_relu}, - {{dnnl::conv2d, dnnl::sum}, dnnl::conv2d_sum}, - {{dnnl::conv2d, dnnl::sum_}, dnnl::conv2d_sum}, - 
{{dnnl::conv2d_relu, dnnl::sum}, dnnl::conv2d_relu_sum} + {{aten::conv2d, aten::add}, AtenIpexCPUDev::conv2d_sum}, + {{aten::conv2d, aten::add_}, AtenIpexCPUDev::conv2d_sum}, + {{AtenIpexCPUDev::conv2d_relu, aten::add}, AtenIpexCPUDev::conv2d_relu_sum} */ }; @@ -355,4 +296,5 @@ void FusionPass(std::shared_ptr &graph) { // TODO: Some post processing?? ECS/EDC/Peephole??? ConstantPropagation(graph); } + }} // namespace torch::jit diff --git a/torch_ipex/csrc/jit/graph_ext.cpp b/torch_ipex/csrc/jit/graph_ext.cpp index 658aa4011..efbec2cf8 100644 --- a/torch_ipex/csrc/jit/graph_ext.cpp +++ b/torch_ipex/csrc/jit/graph_ext.cpp @@ -2,7 +2,6 @@ #include "accelerated_ops.h" namespace torch { namespace jit { -/* void NodeExt::initFormatInfo() { std::vector formatInfo ( this->inputs().size() + this->outputs().size(), @@ -10,7 +9,7 @@ void NodeExt::initFormatInfo() { this->is_(attr::format_info, std::move(formatInfo)); } -*/ + const std::vector& NodeExt::getFormatInfo() const { return this->is(attr::format_info); } @@ -41,7 +40,6 @@ void NodeExt::setGroupInfo(int64_t groups) { this->i_(attr::group_info, groups); } -/* Node *NodeExt::createReorder(Value *v, Graph *g, formatTag from, formatTag to) { NodeExt *reorder = nullptr; if (from != to) { @@ -108,7 +106,7 @@ Node* NodeExt::appendReorder(formatTag to, int i) { return reorder; } -*/ + void NodeExt::propagateFormats() { // TODO: Need consultant with acceleration libraries setOutputFormat(inputFormat()); @@ -187,7 +185,7 @@ bool Conv2dNode::hasConstantParams() const { return has; } -/* + formatTag Conv2dNode::expectedWeightFormat( c10::ArrayRef sizes, c10::List stride, @@ -223,9 +221,8 @@ void Conv2dNode::fixWeightFormatIfPossible() { this->prependReorders(use_list {{this, 1}}, {natureWeightFormat}, {groups}); } } -*/ + bool BatchNorm2dNode::hasConstantParams() const { - /* bool has = this->input(1)->node()->kind() == prim::Constant && this->input(2)->node()->kind() == prim::Constant @@ -236,8 +233,6 @@ bool 
BatchNorm2dNode::hasConstantParams() const { // TODO: more check to make sure return has; - */ - return true; } }} // namespace torch::jit diff --git a/torch_ipex/csrc/jit/graph_ext.h b/torch_ipex/csrc/jit/graph_ext.h index 74f762a37..34a141854 100644 --- a/torch_ipex/csrc/jit/graph_ext.h +++ b/torch_ipex/csrc/jit/graph_ext.h @@ -2,23 +2,21 @@ #include #include - -#include "cpu/dil/dil.hpp" +#include #include "accelerated_ops.h" - #include #include #include namespace torch { namespace jit { -using namespace dil; -using dataType = dil::tensor::data_type; -using formatTag = dil::format_tag; +using namespace ideep; +using dataType = ideep::tensor::data_type; +using formatTag = ideep::format; using formatList = std::vector; using groupsList = std::vector; -//static constexpr auto natureFormat = formatTag::nchw; -//static constexpr auto natureWeightFormat = formatTag::oihw; +static constexpr auto natureFormat = formatTag::nchw; +static constexpr auto natureWeightFormat = formatTag::oihw; // attributes for pyrys ops to decide which format is on // Or what formats transfered by reorder @@ -73,7 +71,7 @@ class NodeExt : public Node { return this->kind() == dnnl::batch_norm; } - //void initFormatInfo(); + void initFormatInfo(); template T* cast() { return reinterpret_cast(this); @@ -81,19 +79,18 @@ class NodeExt : public Node { private: // we save formats as Ints attribute internally const std::vector& getFormatInfo() const; - /* + static Node* createReorder( Value *v, Graph *g, formatTag from, formatTag to); static Node* insertReorder( Value *v, Node *insert_point, formatTag from, formatTag to); - */ }; class Conv2dNode : public NodeExt { public: bool couldInferFormats() const; bool hasConstantParams() const; - //void fixWeightFormatIfPossible(); + void fixWeightFormatIfPossible(); formatTag expectedWeightFormat( c10::ArrayRef sizes, c10::List stride, @@ -102,7 +99,6 @@ class Conv2dNode : public NodeExt { int64_t groups, dataType dtype = dataType::f32) const; }; - class 
BatchNorm2dNode : public NodeExt { public: bool hasConstantParams() const; diff --git a/torch_ipex/csrc/jit/init.cpp b/torch_ipex/csrc/jit/init.cpp index 3316d4b4f..487e7019e 100644 --- a/torch_ipex/csrc/jit/init.cpp +++ b/torch_ipex/csrc/jit/init.cpp @@ -7,22 +7,19 @@ #include #include "accelerated_ops.h" -//#include "op_rewrite.h" -//#include "format_analysis.h" +#include "op_rewrite.h" +#include "format_analysis.h" #include "fusion_pass.h" -//#include "dnnl_ops.h" +#include "dnnl_ops.h" namespace py = pybind11; using namespace torch::jit; -//static bool jit_enabled = false; - -static bool jit_enabled = true; +static bool pyrys_enabled = false; PYBIND11_MODULE(pyrys, m) { m.doc() = "A DO fusion backend for Pytorch JIT"; - /* RegisterPass pass_1([](std::shared_ptr& g) { if (pyrys_enabled) { torch::jit::OpRewritePass(g); @@ -33,17 +30,14 @@ PYBIND11_MODULE(pyrys, m) { torch::jit::FormatOptimize(g); } }); - */ RegisterPass pass_3([](std::shared_ptr& g) { - if (jit_enabled) { - std::cout << "in init\n"; + if (pyrys_enabled) { torch::jit::FusionPass(g); } }); - m.def("enable", []() { jit_enabled = true; }); - m.def("disable", []() { jit_enabled = false; }); - /* + m.def("enable", []() { pyrys_enabled = true; }); + m.def("disable", []() { pyrys_enabled = false; }); m.def("dnnl_conv2d", at::native::dnnl_conv2d, "A conv2d function of dnnl"); m.def("dnnl_conv2d_relu", at::native::dnnl_conv2d_relu, "A conv2d_relu function of dnnl"); m.def("dnnl_relu", at::native::dnnl_relu, "A relu function of dnnl"); @@ -51,5 +45,4 @@ PYBIND11_MODULE(pyrys, m) { m.def("dnnl_batch_norm", at::native::dnnl_batch_norm, "A batch_norm function of dnnl"); m.def("dnnl_pooling_max_2d", at::native::dnnl_pooling_max_2d, "A max-pooling-2d funtion of dnnl"); m.def("dnnl_pooling_avg_2d", at::native::dnnl_pooling_avg_2d, "An avg-pooling-2d funtion of dnnl"); - */ } diff --git a/torch_ipex/csrc/jit/op_rewrite.cpp b/torch_ipex/csrc/jit/op_rewrite.cpp index 8a5a65efe..481cba542 100644 --- 
a/torch_ipex/csrc/jit/op_rewrite.cpp +++ b/torch_ipex/csrc/jit/op_rewrite.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include "graph_ext.h" #include "op_rewrite.h" @@ -24,7 +24,7 @@ NodeExt* replaceOpWithDNNL(Node *node, Graph *g) { auto* replacement = reinterpret_cast( replaceOpWithNewKind(node, g, rules.at(node->kind()))); - //replacement->initFormatInfo(); + replacement->initFormatInfo(); return replacement; } @@ -47,11 +47,10 @@ void OpRewritePass(Block *block) { // need a reorder to transform it back // auto newNode = replaceOpWithDNNL(node, block->owningGraph()); - /* auto conv2d = newNode->cast(); conv2d->fixWeightFormatIfPossible(); conv2d->appendReorder(natureFormat); - */ + // If we could get more information about the weights // We could prepend a reorder for the weights and constant propagation // might help us create a MKL-DNN friendly weight @@ -63,24 +62,20 @@ void OpRewritePass(Block *block) { // auto lh_node = node->input(0)->node(); auto rh_node = node->input(1)->node(); - /* auto by_pass_reorder = [](const Node *n) { return (n->kind() == dnnl::reorder) ? n->input()->node() : n; }; - */ + // // higher priority for conv+sum fusion than other kind // possibly we check whether there is a chance for conv+sum+relu // - /* if (by_pass_reorder(lh_node)->kind() == dnnl::conv2d || by_pass_reorder(rh_node)->kind() == dnnl::conv2d || by_pass_reorder(lh_node)->kind() == dnnl::batch_norm || by_pass_reorder(rh_node)->kind() == dnnl::batch_norm) replaceOpWithDNNL(node, block->owningGraph()); - */ - replaceOpWithDNNL(node, block->owningGraph()); } else if (node->matches("aten::relu(Tensor self) -> Tensor") || node->matches("aten::relu_(Tensor(a!) 
self) -> Tensor(a!)") || node->matches( @@ -107,10 +102,10 @@ void OpRewritePass(Block *block) { } auto newNode = replaceOpWithDNNL(node, block->owningGraph()); - //newNode->appendReorder(natureFormat); + newNode->appendReorder(natureFormat); } else if (node->matches("aten::avg_pool2d(Tensor self, int[] kernel_size, int[] stride=[], int[] padding, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor")) { auto newNode = replaceOpWithDNNL(node, block->owningGraph()); - //newNode->appendReorder(natureFormat); + newNode->appendReorder(natureFormat); } } } diff --git a/torch_ipex/csrc/jit/op_rewrite.h b/torch_ipex/csrc/jit/op_rewrite.h index 2c1ab26a8..cac609ba7 100644 --- a/torch_ipex/csrc/jit/op_rewrite.h +++ b/torch_ipex/csrc/jit/op_rewrite.h @@ -1,7 +1,7 @@ #pragma once #include -#include "cpu/dil/dil.hpp" +#include #include namespace torch { namespace jit { diff --git a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp index 9417ac716..26760b9df 100644 --- a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp +++ b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp @@ -1,9 +1,10 @@ -#include "torch/csrc/jit/runtime/operator.h" -#include "torch/csrc/jit/runtime/custom_operator.h" -#include "accelerated_ops.h" -#include "graph_ext.h" -#include "cpu/DevOPs.h" -//#include "dnnl_ops.h" +#include + +#include +#include + +#include "torch_ipex/csrc/utils.h" +#include "cpu/FusionOPs.h" namespace torch { @@ -23,198 +24,36 @@ at::Tensor toOptionalTensor(const IValue& v) { using namespace torch_ipex::cpu; RegisterOperators op({ - Operator( - "dnnl::reorder(Tensor self) -> Tensor", - [](const Node* node) -> Operation { - return [node] (Stack& stack) { - auto* enode = reinterpret_cast(node); - auto from = enode->inputFormat(0); - auto to = enode->inputFormat(1); - auto groups = enode->getGroupInfo(); - - // auto result = dnnl_reorder( - // (std::move(peek(stack, 0, 1))).toTensor(), from, to, groups); - auto 
result = at::Tensor(); - drop(stack, 1); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), - Operator( - "dnnl::relu(Tensor self) -> Tensor", - [](const Node* node) -> Operation { - return [] (Stack& stack) { - auto result = AtenIpexCPUDev::dil_relu((std::move(peek(stack, 0, 1))).toTensor()); - drop(stack, 1); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), - Operator( - "dnnl::relu_(Tensor(a!) self) -> Tensor(a!)", - [] (const Node* node) -> Operation { - return [] (Stack& stack) { - at::Tensor input; - pop(stack, input); - auto result = AtenIpexCPUDev::dil_relu_(input); - push(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), - Operator( - "dnnl::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor", - [] (const Node* node) -> Operation { - return [] (Stack& stack) { - auto result = AtenIpexCPUDev::dil_convolution( - (std::move(peek(stack, 0, 7))).toTensor(), - (std::move(peek(stack, 1, 7))).toTensor(), - toOptionalTensor(std::move(peek(stack, 2, 7))), - (std::move(peek(stack, 3, 7))).toIntVector(), - (std::move(peek(stack, 4, 7))).toIntVector(), - (std::move(peek(stack, 5, 7))).toIntVector(), - (std::move(peek(stack, 6, 7))).toInt()); - drop(stack, 7); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), Operator( "dnnl::conv2d_relu(Tensor input, Tensor weight, Tensor? 
bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor", [] (const Node* node) ->Operation { - return [] (Stack& stack) { - auto result = AtenIpexCPUDev::dil_convolution_relu( - (std::move(peek(stack, 0, 7))).toTensor(), - (std::move(peek(stack, 1, 7))).toTensor(), - toOptionalTensor(std::move(peek(stack, 2, 7))), - (std::move(peek(stack, 3, 7))).toIntVector(), - (std::move(peek(stack, 4, 7))).toIntVector(), - (std::move(peek(stack, 5, 7))).toIntVector(), - (std::move(peek(stack, 6, 7))).toInt()); - drop(stack, 7); - pack(stack, std::move(result)); - return 0; - }; + if (torch_ipex::check_auto_dnnl()) { + return [] (Stack& stack) { + auto result = AtenIpexJITDev::dil_convolution_relu( + (std::move(peek(stack, 0, 7))).toTensor(), + (std::move(peek(stack, 1, 7))).toTensor(), + toOptionalTensor(std::move(peek(stack, 2, 7))), + (std::move(peek(stack, 3, 7))).toIntVector(), + (std::move(peek(stack, 4, 7))).toIntVector(), + (std::move(peek(stack, 5, 7))).toIntVector(), + (std::move(peek(stack, 6, 7))).toInt()); + drop(stack, 7); + pack(stack, std::move(result)); + return 0; + }; + } else { + TORCH_CHECK(false, "PyTorch native path not support convolution relu fusion now") + } }, aliasAnalysisFromSchema() - ), - Operator( - "dnnl::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? 
running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor", - [] (const Node* node) ->Operation { - return [] (Stack& stack) { - /* - auto result = dnnl_batch_norm( - (std::move(peek(stack, 0, 9))).toTensor(), - toOptionalTensor(std::move(peek(stack, 1, 9))), - toOptionalTensor(std::move(peek(stack, 2, 9))), - toOptionalTensor(std::move(peek(stack, 3, 9))), - toOptionalTensor(std::move(peek(stack, 4, 9))), - (std::move(peek(stack, 5, 9))).toBool(), - (std::move(peek(stack, 6, 9))).toDouble(), - (std::move(peek(stack, 7, 9))).toDouble(), - (std::move(peek(stack, 8, 9))).toBool()); - */ - auto result = at::Tensor(); - drop(stack, 9); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), - Operator( - "dnnl::fold_weight(Tensor weight, Tensor? bn_weight, Tensor? running_var, float eps) -> Tensor", - [] (const Node* node) -> Operation { - return [] (Stack& stack) { - /* - auto result = dnnl_fold_weight( - (std::move(peek(stack, 0, 4))).toTensor(), - toOptionalTensor(std::move(peek(stack, 1, 4))), - toOptionalTensor(std::move(peek(stack, 2, 4))), - (std::move(peek(stack, 3, 4))).toDouble()); - */ - auto result = at::Tensor(); - drop(stack, 4); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), - Operator( - "dnnl::fold_bias(Tensor weight, Tensor? bias, Tensor? bn_weight, Tensor? bn_bias, Tensor? running_mean, Tensor? 
running_var, float eps) -> Tensor", - [] (const Node* node) -> Operation{ - return [] (Stack& stack) { - /* - auto result = dnnl_fold_bias( - (std::move(peek(stack, 0, 7))).toTensor(), - toOptionalTensor(std::move(peek(stack, 1, 7))), - toOptionalTensor(std::move(peek(stack, 2, 7))), - toOptionalTensor(std::move(peek(stack, 3, 7))), - toOptionalTensor(std::move(peek(stack, 4, 7))), - toOptionalTensor(std::move(peek(stack, 5, 7))), - (std::move(peek(stack, 6, 7))).toDouble()); - */ - auto result = at::Tensor(); - drop(stack, 7); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), - Operator( - "dnnl::sum(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", - [] (const Node* node) ->Operation { - return [] (Stack& stack) { - /* - auto result = dnnl_sum( - (std::move(peek(stack, 0, 3))).toTensor(), - (std::move(peek(stack, 1, 3))).toTensor(), - (std::move(peek(stack, 2, 3))).toScalar() - ); - */ - auto result = at::Tensor(); - drop(stack, 3); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), - Operator( - "dnnl::sum_(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)", - [] (const Node* node) ->Operation{ - return [](Stack &stack) { - auto self = (std::move(peek(stack, 0, 3))).toTensor(); - /* - auto result = dnnl_sum_( - self, - (std::move(peek(stack, 1, 3))).toTensor(), - (std::move(peek(stack, 2, 3))).toScalar()); - */ - auto result = at::Tensor(); - drop(stack, 3); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), + ) /* Operator( "dnnl::conv2d_sum(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, Tensor(a!) 
accumu, *, Scalar alpha=1) -> Tensor(a!)", [] (const Node* node) ->Operation { return [] (Stack& stack) { auto output = (std::move(peek(stack, 7, 9))).toTensor(); - auto result = dnnl_conv2d_sum( + auto result = AtenIpexCPUDev::conv2d_sum( (std::move(peek(stack, 0, 9))).toTensor(), (std::move(peek(stack, 1, 9))).toTensor(), toOptionalTensor(std::move(peek(stack, 2, 9))), @@ -238,7 +77,7 @@ RegisterOperators op({ [] (const Node* node) ->Operation { return [] (Stack& stack) { auto output = (std::move(peek(stack, 7, 9))).toTensor(); - auto result = dnnl_conv2d_sum_relu( + auto result = AtenIpexCPUDev::conv2d_sum_relu( (std::move(peek(stack, 0, 9))).toTensor(), (std::move(peek(stack, 1, 9))).toTensor(), toOptionalTensor(std::move(peek(stack, 2, 9))), @@ -257,47 +96,6 @@ RegisterOperators op({ }, aliasAnalysisFromSchema() ),*/ - Operator( - "dnnl::pooling_max_2d(Tensor input, int[2] kernel_size, int[2] stride=1, int[2] padding=0, int[2] dilation=1, bool ceil_mode=0) -> Tensor(a!)", - [] (const Node *node) ->Operation { - return [] (Stack& stack) { - /* - auto result = dnnl_pooling_max_2d( - (std::move(peek(stack, 0, 6))).toTensor(), // Input tensor - (std::move(peek(stack, 1, 6))).toIntVector(), // Kernel size - (std::move(peek(stack, 2, 6))).toIntVector(), // Stride - (std::move(peek(stack, 3, 6))).toIntVector(), // Padding - (std::move(peek(stack, 4, 6))).toIntVector(), // Dilation - (std::move(peek(stack, 5, 6))).toBool()); // Ceil mode - */ - auto result = at::Tensor(); - drop(stack, 6); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), - Operator( - "dnnl::pooling_avg_2d(Tensor input, int[2] kernel_size, int[2] stride=1, int[2] padding=0, bool ceil_mode=0, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor(a!)", - [] (const Node *node) ->Operation { - return [] (Stack& stack) { - /* - auto result = dnnl_pooling_avg_2d( - (std::move(peek(stack, 0, 7))).toTensor(), // Input tensor - (std::move(peek(stack, 1, 7))).toIntVector(), // Kernel size - (std::move(peek(stack, 2, 7))).toIntVector(), // Stride - (std::move(peek(stack, 3, 7))).toIntVector(), // Padding - (std::move(peek(stack, 4, 7))).toBool()); // Ceil mode - */ - auto result = at::Tensor(); - drop(stack, 7); - pack(stack, std::move(result)); - return 0; - }; - }, - aliasAnalysisFromSchema() - ), }); } } From c08064abfa10dc617d15dbee4e536f4dd4cb9c98 Mon Sep 17 00:00:00 2001 From: "Zhang, Xiaobing" Date: Wed, 20 May 2020 14:45:05 +0800 Subject: [PATCH 03/10] jit: enable conv_sum and conv_sum_relu fusion --- tests/cpu/test_jit.py | 101 ++++++++++++++---- torch_ipex/csrc/cpu/FusionOPs.cpp | 72 +++++++++++++ torch_ipex/csrc/cpu/FusionOPs.h | 15 +-- torch_ipex/csrc/cpu/dbl/Common.cpp | 2 - torch_ipex/csrc/cpu/dbl/Conv.cpp | 61 +++++++++++ torch_ipex/csrc/cpu/dbl/Conv.h | 11 ++ torch_ipex/csrc/jit/fusion_pass.cpp | 18 ++-- torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp | 91 ++++++++-------- 8 files changed, 290 insertions(+), 81 deletions(-) diff --git a/tests/cpu/test_jit.py b/tests/cpu/test_jit.py index 42142cdc8..f585da98d 100644 --- a/tests/cpu/test_jit.py +++ b/tests/cpu/test_jit.py @@ -56,6 +56,7 @@ import torch import torch.nn as nn +from torch.jit._recursive import wrap_cpp_module import copy import intel_pytorch_extension @@ -82,29 +83,89 @@ torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) -class Conv_relu(nn.Module): - def __init__(self): - super(Conv_relu, self).__init__() +def test_output(model, x): + modelName = model.__class__.__name__ + core.disable_jit() + + model = model.to('dpcpp').eval() + x = x.to('dpcpp') + with torch.no_grad(): + result = model(x) + + smodel = torch.jit.script(model) + smodel.eval() + with torch.no_grad(): +
sresult = smodel(x) + + print(f'\nAre {modelName} and Scripted{modelName} outputs the same: ', + torch.allclose( + sresult, result, rtol=1e-05, atol=1e-06, equal_nan=False)) + + core.enable_jit() + pmodel = torch.jit.script(model) + # bn folding + pmodel = wrap_cpp_module(torch._C._jit_pass_fold_convbn(pmodel._c)) + with torch.no_grad(): + # conv relu fusion, conv sum fusion or conv sum relu fusion + print(pmodel.graph_for(x)) + presult = pmodel(x) + + # print(result) + # print(sresult) + # print(presult) + + print(f'\nWith or without pyrys, are Scripted{modelName} outputs the same: ', + torch.allclose( + sresult, presult, rtol=1e-05, atol=1e-06, equal_nan=False)) + +class Conv2dRelu_Fixed(nn.Module): + def __init__(self, in_channels, out_channels, **kwargs): + super(Conv2dRelu_Fixed, self).__init__() + seed = 2018 + torch.manual_seed(seed) + self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) + + def forward(self, x): + return F.relu(self.conv(x), inplace=True) + +class CascadedConv2dBnSumRelu(nn.Module): + def __init__(self, in_channels, mid_channels, out_channels, **kwargs): + super(CascadedConv2dBnSumRelu, self).__init__() torch.manual_seed(2018) - self.conv = torch.nn.Conv2d(20, 20, 5) + self.conv = nn.Conv2d(in_channels, mid_channels, bias=False, **kwargs) + self.conv1 = nn.Conv2d( + mid_channels, out_channels, bias=False, padding=1, **kwargs) + self.conv2 = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) + self.bn = nn.BatchNorm2d(mid_channels, eps=0.001) + self.bn1 = nn.BatchNorm2d(out_channels, eps=0.001) + self.bn2 = nn.BatchNorm2d(out_channels, eps=0.001) def forward(self, x): - x = self.conv(x) - return x.relu() - -class TestJITOP(TestCase): - def test_conv_relu_fusion(self): - x = torch.randn(1, 20, 20, 20).to('dpcpp') - - model = Conv_relu().to('dpcpp').eval() - - with torch.no_grad(): - core.disable_jit() - y1 = model(x) - core.enable_jit() - script_model = torch.jit.script(model) - y2 = script_model(x) - 
self.assertEqual(y1, y2) + a = self.conv(x) + a = self.bn(a) + a = F.relu(a, inplace=True) + a = self.conv1(a) + a = self.bn1(a) + b = self.conv2(x) + b = self.bn2(b) + return F.relu(a.add_(b), inplace=True) + +class Tester(TestCase): + n = 32 + c = 3 + h = 224 + w = 224 + print('input size: (%d, %d, %d, %d)' % (n, c, h, w)) + + def test_output_conv_relu(self): + test_output( + Conv2dRelu_Fixed(self.c, 32, kernel_size=3, stride=1), + torch.rand(self.n, self.c, self.h, self.w)) + + def test_output_cascaded_conv2d_bn_sum_relu(self): + test_output( + CascadedConv2dBnSumRelu(self.c, 64, 32, kernel_size=3, stride=1), + torch.rand(self.n, self.c, self.h, self.w)) if __name__ == '__main__': core.enable_auto_dnnl() diff --git a/torch_ipex/csrc/cpu/FusionOPs.cpp b/torch_ipex/csrc/cpu/FusionOPs.cpp index aa06f5105..c87c0940e 100644 --- a/torch_ipex/csrc/cpu/FusionOPs.cpp +++ b/torch_ipex/csrc/cpu/FusionOPs.cpp @@ -55,5 +55,77 @@ at::Tensor AtenIpexJITDev::dil_convolution_relu( return dbl::comm::gen_aten_tensor_by(dil_output); } +static at::Tensor& dil_convolution_inplace_fusion( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& bias, + at::Tensor& accumu, + at::IntArrayRef stride, + at::IntArrayRef padding, + at::IntArrayRef dilation, + int64_t groups, + const dil::attr_t& attr) { + dil::tensor dil_input; + dil::tensor dil_weight; + dil::tensor dil_output; + c10::optional dil_bias{c10::nullopt}; + + auto input_contiguous = input.contiguous(); + auto weight_contiguous = weight.contiguous(); + auto output_contiguous = accumu.contiguous(); + + dil_input = dbl::comm::try_gen_dil_tensor(input_contiguous); + dil_weight = dbl::comm::try_gen_dil_tensor(weight_contiguous); + dil_output = dbl::comm::try_gen_dil_tensor(output_contiguous); + if (bias.defined()) { + auto bias_contiguous = bias.contiguous(); + dil_bias = dbl::comm::try_gen_dil_tensor(bias_contiguous); + } + + dbl::conv::conv2d_inplace_impl( + dil_input, + dil_weight, + dil_bias, + dil_output, + 
padding, + stride, + dilation, + groups, + attr); + + dbl::comm::sync_shape_from_dil_to_aten(accumu, dil_output); + return accumu; +} + +at::Tensor& AtenIpexJITDev::dil_convolution_sum( + const at::Tensor & input, + const at::Tensor & weight, + const at::Tensor & bias, + at::IntArrayRef stride, + at::IntArrayRef padding, + at::IntArrayRef dilation, + int64_t groups, + at::Tensor& accumu, + at::Scalar alpha) { + auto scale = alpha.to(); + return dil_convolution_inplace_fusion(input, weight, bias, accumu, stride, padding, + dilation, groups, dil::attr_t::fuse_sum(scale)); +} + +at::Tensor& AtenIpexJITDev::dil_convolution_sum_relu( + const at::Tensor & input, + const at::Tensor & weight, + const at::Tensor & bias, + at::IntArrayRef stride, + at::IntArrayRef padding, + at::IntArrayRef dilation, + int64_t groups, + at::Tensor& accumu, + at::Scalar alpha) { + auto scale = alpha.to(); + return dil_convolution_inplace_fusion(input, weight, bias, accumu, stride, padding, + dilation, groups, dil::attr_t::residual(scale)); +} + } // namespace cpu } // namespace torch_ipex diff --git a/torch_ipex/csrc/cpu/FusionOPs.h b/torch_ipex/csrc/cpu/FusionOPs.h index 14f3db7e1..dcab1ea66 100644 --- a/torch_ipex/csrc/cpu/FusionOPs.h +++ b/torch_ipex/csrc/cpu/FusionOPs.h @@ -11,12 +11,11 @@ namespace torch { namespace jit { // XXX: PyTorch does not support nesting namespace // And the alias analysis is not working for namespace other than aten ... // So we fake some op namespaces to workaround that. 
-namespace dnnl { - static auto conv2d_relu = Symbol::fromQualString("dnnl::conv2d_relu"); - static auto conv2d_sum = Symbol::fromQualString("dnnl::conv2d_sum"); - static auto conv2d_relu_sum = Symbol::fromQualString("dnnl::conv2d_relu_sum"); - static auto conv2d_sum_relu = Symbol::fromQualString("dnnl::conv2d_sum_relu"); - +namespace ipex { + static auto conv2d_relu = Symbol::fromQualString("ipex::conv2d_relu"); + static auto conv2d_sum = Symbol::fromQualString("ipex::conv2d_sum"); + static auto conv2d_relu_sum = Symbol::fromQualString("ipex::conv2d_relu_sum"); + static auto conv2d_sum_relu = Symbol::fromQualString("ipex::conv2d_sum_relu"); } }} // namespace torch::jit @@ -29,6 +28,10 @@ class AtenIpexJITDev { // for JIT ops static at::Tensor dil_convolution_relu(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups); + static at::Tensor& dil_convolution_sum(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor& accumu, at::Scalar alpha); + + static at::Tensor& dil_convolution_sum_relu( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor& accumu, at::Scalar alpha); + }; } // namespace cpu diff --git a/torch_ipex/csrc/cpu/dbl/Common.cpp b/torch_ipex/csrc/cpu/dbl/Common.cpp index 3be05955d..913643ce3 100644 --- a/torch_ipex/csrc/cpu/dbl/Common.cpp +++ b/torch_ipex/csrc/cpu/dbl/Common.cpp @@ -91,7 +91,6 @@ void sync_shape_from_dil_to_aten(const at::Tensor& ipex_tensor, const dil::tenso dil::dims sizes = dil_tensor.get_dims(); if (dil_tensor.is_public_format()) { dil::dims strides = dil_tensor.get_strides(); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipex_tensor.device().type() == at::DeviceType::DPCPP); auto* 
_tensor_impl = (IPEXTensorImpl *)ipex_tensor.unsafeGetTensorImpl(); _tensor_impl->force_set_strided(sizes, strides); } else { @@ -99,7 +98,6 @@ void sync_shape_from_dil_to_aten(const at::Tensor& ipex_tensor, const dil::tenso TORCH_INTERNAL_ASSERT_DEBUG_ONLY(sizes.size() != 1 || sizes[0] != 0); ipex_tensor.unsafeGetTensorImpl()->set_sizes_contiguous(sizes); } - } } // namespace comm diff --git a/torch_ipex/csrc/cpu/dbl/Conv.cpp b/torch_ipex/csrc/cpu/dbl/Conv.cpp index 9dadfdebd..c3fce71ca 100644 --- a/torch_ipex/csrc/cpu/dbl/Conv.cpp +++ b/torch_ipex/csrc/cpu/dbl/Conv.cpp @@ -86,6 +86,67 @@ dil::tensor conv2d_impl( return y; } +void conv2d_inplace_impl( + const dil::tensor& x, + const dil::tensor& w, + const c10::optional& b, + dil::tensor& y, + at::IntArrayRef padding, + at::IntArrayRef stride, + at::IntArrayRef dilation, + int64_t groups, + const dil::attr_t& attr) { + std::vector kernel_size(x.ndims()); + // mkldnn conv2d weights could have been re-ordered to 5d by + // mkldnn_reorder_conv2d_weight + if (w.ndims() == x.ndims() + 1) { + AT_ASSERTM( + groups > 1, + "Only group _mkldnn_conv2d weights could have been reordered to 5d"); + kernel_size[0] = w.get_dim(0) * w.get_dim(1); + std::copy_n(w.get_dims().cbegin() + 2, x.ndims() - 1, kernel_size.begin() + 1); + } else { + std::copy_n(w.get_dims().cbegin(), x.ndims(), kernel_size.begin()); + } + + const dil::dims x_dims = x.get_dims(); + std::vector input_size{x_dims.cbegin(), x_dims.cend()}; + std::vector output_sizes = calc_conv_output_size(input_size, kernel_size, padding, stride, dilation); + + if (b.has_value()) { + dil::convolution_forward::compute( + x, + w, + b.value(), + {output_sizes.cbegin(), output_sizes.cend()}, + y, + {stride.begin(), stride.end()}, + {dilation.begin(), dilation.end()}, + {padding.begin(), padding.end()}, + {padding.begin(), padding.end()}, + groups, + dil::scale_t(), + dil::scale_t(), + dil::scale_t(), + attr); + } else { + dil::convolution_forward::compute( + x, + w, + 
{output_sizes.cbegin(), output_sizes.cend()}, + y, + {stride.begin(), stride.end()}, + {dilation.begin(), dilation.end()}, + {padding.begin(), padding.end()}, + {padding.begin(), padding.end()}, + groups, + dil::scale_t(), + dil::scale_t(), + dil::scale_t(), + attr); + } +} + } // namespace conv } // namespace dbl } // namespace cpu diff --git a/torch_ipex/csrc/cpu/dbl/Conv.h b/torch_ipex/csrc/cpu/dbl/Conv.h index e4d41aa33..5f954f330 100644 --- a/torch_ipex/csrc/cpu/dbl/Conv.h +++ b/torch_ipex/csrc/cpu/dbl/Conv.h @@ -28,6 +28,17 @@ dil::tensor conv2d_impl( int64_t groups, const dil::attr_t& attr = dil::attr_t()); +void conv2d_inplace_impl( + const dil::tensor& x, + const dil::tensor& w, + const c10::optional& b, + dil::tensor& y, + at::IntArrayRef padding, + at::IntArrayRef stride, + at::IntArrayRef dilation, + int64_t groups, + const dil::attr_t& attr = dil::attr_t()); + } // namespace conv } // namespace dbl } // namespace cpu diff --git a/torch_ipex/csrc/jit/fusion_pass.cpp b/torch_ipex/csrc/jit/fusion_pass.cpp index 9568e1015..2661c7844 100644 --- a/torch_ipex/csrc/jit/fusion_pass.cpp +++ b/torch_ipex/csrc/jit/fusion_pass.cpp @@ -275,16 +275,14 @@ class OpFuser { // TODO: These rules should be more scalable OpFuser::RuleTab OpFuser::dnnlRules = { - {{aten::conv2d, aten::relu}, dnnl::conv2d_relu}, - {{aten::conv2d, Symbol::fromQualString("aten::relu_")}, dnnl::conv2d_relu}, - /* - {{AtenIpexCPUDev::conv2d_sum, AtenIpexCPUDev::relu}, AtenIpexCPUDev::conv2d_sum_relu}, - {{AtenIpexCPUDev::conv2d_sum, dnnl::relu_}, AtenIpexCPUDev::conv2d_sum_relu}, - - {{aten::conv2d, aten::add}, AtenIpexCPUDev::conv2d_sum}, - {{aten::conv2d, aten::add_}, AtenIpexCPUDev::conv2d_sum}, - {{AtenIpexCPUDev::conv2d_relu, aten::add}, AtenIpexCPUDev::conv2d_relu_sum} - */ + {{aten::conv2d, aten::relu}, ipex::conv2d_relu}, + {{aten::conv2d, Symbol::fromQualString("aten::relu_")}, ipex::conv2d_relu}, + {{ipex::conv2d_sum, aten::relu}, ipex::conv2d_sum_relu}, + {{ipex::conv2d_sum, 
Symbol::fromQualString("aten::relu_")}, ipex::conv2d_sum_relu}, + + {{aten::conv2d, aten::add}, ipex::conv2d_sum}, + {{aten::conv2d, aten::add_}, ipex::conv2d_sum}, + //{{dnnl::conv2d_relu, aten::add}, dnnl::conv2d_relu_sum} }; void FusionPass(std::shared_ptr &graph) { diff --git a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp index 26760b9df..bcfe69c27 100644 --- a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp +++ b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp @@ -25,7 +25,7 @@ using namespace torch_ipex::cpu; RegisterOperators op({ Operator( - "dnnl::conv2d_relu(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor", + "ipex::conv2d_relu(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor", [] (const Node* node) ->Operation { if (torch_ipex::check_auto_dnnl()) { return [] (Stack& stack) { @@ -42,60 +42,65 @@ RegisterOperators op({ return 0; }; } else { - TORCH_CHECK(false, "PyTorch native path not support convolution relu fusion now") + TORCH_CHECK(false, "PyTorch native path not support convolution relu fusion now"); } }, aliasAnalysisFromSchema() - ) - /* + ), Operator( - "dnnl::conv2d_sum(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, Tensor(a!) accumu, *, Scalar alpha=1) -> Tensor(a!)", + "ipex::conv2d_sum(Tensor input, Tensor weight, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, int groups, Tensor(a!) 
accumu, *, Scalar alpha) -> Tensor(a!)", [] (const Node* node) ->Operation { - return [] (Stack& stack) { - auto output = (std::move(peek(stack, 7, 9))).toTensor(); - auto result = AtenIpexCPUDev::conv2d_sum( - (std::move(peek(stack, 0, 9))).toTensor(), - (std::move(peek(stack, 1, 9))).toTensor(), - toOptionalTensor(std::move(peek(stack, 2, 9))), - (std::move(peek(stack, 3, 9))).toIntVector(), - (std::move(peek(stack, 4, 9))).toIntVector(), - (std::move(peek(stack, 5, 9))).toIntVector(), - (std::move(peek(stack, 6, 9))).toInt(), - output, - (std::move(peek(stack, 8, 9))).toScalar() - ); - auto result = at::Tensor(); - drop(stack, 9); - pack(stack, std::move(result)); - return 0; - }; + if (torch_ipex::check_auto_dnnl()) { + return [] (Stack& stack) { + auto output = (std::move(peek(stack, 7, 9))).toTensor(); + auto result = AtenIpexJITDev::dil_convolution_sum( + (std::move(peek(stack, 0, 9))).toTensor(), + (std::move(peek(stack, 1, 9))).toTensor(), + toOptionalTensor(std::move(peek(stack, 2, 9))), + (std::move(peek(stack, 3, 9))).toIntVector(), + (std::move(peek(stack, 4, 9))).toIntVector(), + (std::move(peek(stack, 5, 9))).toIntVector(), + (std::move(peek(stack, 6, 9))).toInt(), + output, + (std::move(peek(stack, 8, 9))).toScalar() + ); + drop(stack, 9); + pack(stack, std::move(result)); + return 0; + }; + } else { + TORCH_CHECK(false, "PyTorch native path not support convolution sum fusion now"); + } }, aliasAnalysisFromSchema() ), Operator( - "dnnl::conv2d_sum_relu(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, Tensor(a!) accumu, *, Scalar alpha=1) -> Tensor(a!)", + "ipex::conv2d_sum_relu(Tensor input, Tensor weight, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, int groups, Tensor(a!) 
accumu, *, Scalar alpha) -> Tensor(a!)", [] (const Node* node) ->Operation { - return [] (Stack& stack) { - auto output = (std::move(peek(stack, 7, 9))).toTensor(); - auto result = AtenIpexCPUDev::conv2d_sum_relu( - (std::move(peek(stack, 0, 9))).toTensor(), - (std::move(peek(stack, 1, 9))).toTensor(), - toOptionalTensor(std::move(peek(stack, 2, 9))), - (std::move(peek(stack, 3, 9))).toIntVector(), - (std::move(peek(stack, 4, 9))).toIntVector(), - (std::move(peek(stack, 5, 9))).toIntVector(), - (std::move(peek(stack, 6, 9))).toInt(), - output, - (std::move(peek(stack, 8, 9))).toScalar() - ); - auto result = at::Tensor(); - drop(stack, 9); - pack(stack, std::move(result)); - return 0; - }; + if (torch_ipex::check_auto_dnnl()) { + return [] (Stack& stack) { + auto output = (std::move(peek(stack, 7, 9))).toTensor(); + auto result = AtenIpexJITDev::dil_convolution_sum_relu( + (std::move(peek(stack, 0, 9))).toTensor(), + (std::move(peek(stack, 1, 9))).toTensor(), + toOptionalTensor(std::move(peek(stack, 2, 9))), + (std::move(peek(stack, 3, 9))).toIntVector(), + (std::move(peek(stack, 4, 9))).toIntVector(), + (std::move(peek(stack, 5, 9))).toIntVector(), + (std::move(peek(stack, 6, 9))).toInt(), + output, + (std::move(peek(stack, 8, 9))).toScalar() + ); + drop(stack, 9); + pack(stack, std::move(result)); + return 0; + }; + } else { + TORCH_CHECK(false, "PyTorch native path not support convolution sum relu fusion now"); + } }, aliasAnalysisFromSchema() - ),*/ + ) }); } } From e7ee4b389edc2bb02860c08ae4967bbc3ba77427 Mon Sep 17 00:00:00 2001 From: "Zhang, Xiaobing" Date: Mon, 25 May 2020 18:55:44 +0800 Subject: [PATCH 04/10] make rewrited linear op can be traced --- intel_pytorch_extension_py/ops/linear.py | 27 +----------- torch_ipex/csrc/cpu/CustomerOps.h | 55 ++++++++++++++++++++++++ torch_ipex/csrc/cpu/DevOPs.cpp | 11 +++-- torch_ipex/csrc/cpu/DevOPs.h | 4 +- torch_ipex/csrc/cpu/ExtendOPs.cpp | 6 ++- torch_ipex/csrc/cpu/ExtendOPs.h | 2 +- 
torch_ipex/csrc/cpu/FusionOPs.cpp | 2 +- torch_ipex/csrc/cpu/RegisterOps.cpp | 11 +++++ torch_ipex/csrc/cpu/dbl/Common.cpp | 1 + torch_ipex/csrc/init_python_bindings.cpp | 2 +- 10 files changed, 83 insertions(+), 38 deletions(-) create mode 100644 torch_ipex/csrc/cpu/CustomerOps.h create mode 100644 torch_ipex/csrc/cpu/RegisterOps.cpp diff --git a/intel_pytorch_extension_py/ops/linear.py b/intel_pytorch_extension_py/ops/linear.py index 05a90b23b..8ec9e76c9 100644 --- a/intel_pytorch_extension_py/ops/linear.py +++ b/intel_pytorch_extension_py/ops/linear.py @@ -3,29 +3,4 @@ import torch.nn.functional as F import _torch_ipex as core -F_linear = F.linear - -class LinearFunction(Function): - @staticmethod - def forward(ctx, input, weight, bias): - output = core.linear(input, weight, bias) - ctx.save_for_backward(input, weight, bias) - return output - - @staticmethod - def backward(ctx, grad_output): - input, weight, bias = ctx.saved_tensors - grad_output = grad_output.contiguous() - if bias == None: - output_mask = (input.requires_grad, weight.requires_grad, 0) - else: - output_mask = (input.requires_grad, weight.requires_grad, bias.requires_grad) - grad_input, grad_weight, grad_bias = core.linear_backward(input, grad_output, weight, output_mask) - return (grad_input, grad_weight, grad_bias) - -def linear(input, weight, bias=None): - if input.device.type == 'dpcpp' and core.get_auto_dnnl(): - return LinearFunction.apply(input, weight, bias) - return F_linear(input, weight, bias) - -F.linear = linear +F.linear = torch.ops.torch_ipex.linear diff --git a/torch_ipex/csrc/cpu/CustomerOps.h b/torch_ipex/csrc/cpu/CustomerOps.h new file mode 100644 index 000000000..ea108ec4e --- /dev/null +++ b/torch_ipex/csrc/cpu/CustomerOps.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "torch_ipex/csrc/utils.h" +#include "DevOPs.h" + +using namespace at; + +class NewLinearOp : public torch::autograd::Function { + public: + static 
at::Tensor forward( + torch::autograd::AutogradContext* ctx, + at::Tensor input, + at::Tensor weight, + at::Tensor bias) { + ctx->save_for_backward({input, weight, bias}); + if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { + return torch_ipex::cpu::AtenIpexCPUDev::dil_linear(input, weight, bias); + } else { + return at::linear(input, weight, bias); + } + } + + static torch::autograd::tensor_list backward( + torch::autograd::AutogradContext* ctx, + torch::autograd::tensor_list grad_outputs) { + auto saved = ctx->get_saved_variables(); + at::Tensor input = saved[0]; + at::Tensor weight = saved[1]; + at::Tensor bias = saved[2]; + + at::Tensor grad_output = grad_outputs[0]; + at::Tensor grad_input, grad_weight; + at::Tensor grad_bias = torch::Tensor(); + + if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { + grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_input( + input.sizes(), grad_output, weight); + std::tie(grad_weight, grad_bias) = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_weights( + grad_output, input, weight, bias.defined()); + } else { + auto grad_input = grad_output.mm(weight); + auto grad_weight = grad_output.t().mm(input); + if (bias.defined()) { + grad_bias = grad_output.sum(0); + } + } + return {grad_input, grad_weight, grad_bias}; + } +}; diff --git a/torch_ipex/csrc/cpu/DevOPs.cpp b/torch_ipex/csrc/cpu/DevOPs.cpp index ebe231f42..9f405ed02 100644 --- a/torch_ipex/csrc/cpu/DevOPs.cpp +++ b/torch_ipex/csrc/cpu/DevOPs.cpp @@ -526,7 +526,7 @@ at::Tensor& AtenIpexCPUDev::dil_addbmm_( at::Tensor AtenIpexCPUDev::dil_linear( const at::Tensor& self, const at::Tensor& weight, - const c10::optional& bias) { + const at::Tensor& bias) { DEBUG("AtenIpexCPUDev::dil_linear\n"); CHECK_DNNL_OP_PRE_COND(self); CHECK_DNNL_OP_PRE_COND(weight); @@ -539,9 +539,8 @@ at::Tensor AtenIpexCPUDev::dil_linear( const dil::tensor w = dbl::comm::try_gen_dil_tensor(weight); 
dil::tensor y; - if (bias.has_value()) { - at::Tensor bias_vec = bias.value(); - const dil::tensor b = dbl::comm::try_gen_dil_tensor(bias_vec); + if (bias.defined()) { + const dil::tensor b = dbl::comm::try_gen_dil_tensor(bias); dil::inner_product_forward::compute(x, w, b, y); } else { dil::inner_product_forward::compute(x, w, y); @@ -599,7 +598,7 @@ at::Tensor AtenIpexCPUDev::dil_linear_fuse_relu( return dbl::comm::gen_aten_tensor_by(std::move(y)); } -at::Tensor dil_linear_backward_input( +at::Tensor AtenIpexCPUDev::dil_linear_backward_input( at::IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight){ DEBUG("AtenIpexCPUDev::dil_linear_backward_input\n"); auto grad_output_reshaped = grad_output.dim() > 2 ? @@ -621,7 +620,7 @@ at::Tensor dil_linear_backward_input( return dbl::comm::gen_aten_tensor_by(std::move(gradx)); } -std::tuple dil_linear_backward_weights( +std::tuple AtenIpexCPUDev::dil_linear_backward_weights( const at::Tensor& grad_output, const at::Tensor& input, const at::Tensor& weight, bool bias_defined) { DEBUG("AtenIpexCPUDev::dil_linear_backward_weights\n"); auto grad_output_reshaped = grad_output.dim() > 2 ? 
diff --git a/torch_ipex/csrc/cpu/DevOPs.h b/torch_ipex/csrc/cpu/DevOPs.h index 941bf93a8..49c47a199 100644 --- a/torch_ipex/csrc/cpu/DevOPs.h +++ b/torch_ipex/csrc/cpu/DevOPs.h @@ -38,8 +38,10 @@ class AtenIpexCPUDev { static at::Tensor dil_addbmm(const at::Tensor &self, const at::Tensor &batch1, const at::Tensor &batch2, at::Scalar beta, at::Scalar alpha); static at::Tensor& dil_addbmm_(at::Tensor& self, const at::Tensor& batch1, const at::Tensor& batch2, at::Scalar beta, at::Scalar alpha); static at::Tensor& dil_addbmm_out(at::Tensor& result, const at::Tensor &self, const at::Tensor &batch1, const at::Tensor &batch2, at::Scalar beta, at::Scalar alpha); - static at::Tensor dil_linear(const at::Tensor& self, const at::Tensor& weight, const c10::optional& bias); static at::Tensor dil_linear_fuse_relu(const at::Tensor& self, const at::Tensor& weight, const c10::optional& bias); + static at::Tensor dil_linear(const at::Tensor& self, const at::Tensor& weight, const at::Tensor& bias); + static at::Tensor dil_linear_backward_input(at::IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight); + static std::tuple dil_linear_backward_weights(const at::Tensor& grad_output, const at::Tensor& input, const at::Tensor& weight, bool bias_defined); static std::tuple dil_linear_backward(const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, std::array output_mask); static at::Tensor dil_dropout(const at::Tensor& self, double ratio, bool train); static at::Tensor dil_dropout_backward(const at::Tensor& grady, const at::Tensor& mask, double ratio); diff --git a/torch_ipex/csrc/cpu/ExtendOPs.cpp b/torch_ipex/csrc/cpu/ExtendOPs.cpp index bb11a869f..80d337cc3 100644 --- a/torch_ipex/csrc/cpu/ExtendOPs.cpp +++ b/torch_ipex/csrc/cpu/ExtendOPs.cpp @@ -10,6 +10,7 @@ #include "xsmm/libxsmm_utils.h" #include "../utils.h" #include "DevOPs.h" +#include "CustomerOps.h" namespace torch_ipex { @@ -449,8 +450,9 @@ 
AtenIpexTypeExt::embedding_bag_backward(const at::Tensor& grad, const at::Tensor return cpu::aten::embedding_bag::embedding_bag_backward_impl(grad, indices, offsets, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, sparse, _per_sample_weights); } -at::Tensor AtenIpexTypeExt::linear(const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias) { - return cpu::AtenIpexCPUDev::dil_linear(input, weight, bias); + +at::Tensor AtenIpexTypeExt::linear(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias) { + return NewLinearOp::apply(input, weight, bias); } at::Tensor AtenIpexTypeExt::linear_fuse_relu(const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias) { diff --git a/torch_ipex/csrc/cpu/ExtendOPs.h b/torch_ipex/csrc/cpu/ExtendOPs.h index 9305e454b..dedc3e2a4 100644 --- a/torch_ipex/csrc/cpu/ExtendOPs.h +++ b/torch_ipex/csrc/cpu/ExtendOPs.h @@ -23,8 +23,8 @@ class AtenIpexTypeExt { int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional& per_sample_weights); - static at::Tensor linear(const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias); static at::Tensor linear_fuse_relu(const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias); + static at::Tensor linear(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias = at::Tensor()); static std::tuple linear_backward(const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, std::array output_mask); static at::Tensor relu_use_dst_for_bwd(const at::Tensor& grad_output, const at::Tensor& output); static at::Tensor adaptive_avg_pool2d(at::Tensor const& input, at::IntArrayRef output_size); diff --git a/torch_ipex/csrc/cpu/FusionOPs.cpp b/torch_ipex/csrc/cpu/FusionOPs.cpp index c87c0940e..2e3e1b9c5 100644 --- a/torch_ipex/csrc/cpu/FusionOPs.cpp +++ b/torch_ipex/csrc/cpu/FusionOPs.cpp @@ -59,7 +59,7 @@ static at::Tensor& 
dil_convolution_inplace_fusion( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, - at::Tensor& accumu, + at::Tensor& accumu, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, diff --git a/torch_ipex/csrc/cpu/RegisterOps.cpp b/torch_ipex/csrc/cpu/RegisterOps.cpp new file mode 100644 index 000000000..16e017c8b --- /dev/null +++ b/torch_ipex/csrc/cpu/RegisterOps.cpp @@ -0,0 +1,11 @@ +#include +#include "ExtendOPs.h" + +static auto registry = + torch::RegisterOperators() + .op("torch_ipex::linear", + [](const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias) { + return torch_ipex::AtenIpexTypeExt::linear(input, weight, bias); + }); + + diff --git a/torch_ipex/csrc/cpu/dbl/Common.cpp b/torch_ipex/csrc/cpu/dbl/Common.cpp index 913643ce3..13cabe94e 100644 --- a/torch_ipex/csrc/cpu/dbl/Common.cpp +++ b/torch_ipex/csrc/cpu/dbl/Common.cpp @@ -91,6 +91,7 @@ void sync_shape_from_dil_to_aten(const at::Tensor& ipex_tensor, const dil::tenso dil::dims sizes = dil_tensor.get_dims(); if (dil_tensor.is_public_format()) { dil::dims strides = dil_tensor.get_strides(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipex_tensor.device().type() == at::DeviceType::DPCPP); auto* _tensor_impl = (IPEXTensorImpl *)ipex_tensor.unsafeGetTensorImpl(); _tensor_impl->force_set_strided(sizes, strides); } else { diff --git a/torch_ipex/csrc/init_python_bindings.cpp b/torch_ipex/csrc/init_python_bindings.cpp index 33492bf09..33066cd5a 100644 --- a/torch_ipex/csrc/init_python_bindings.cpp +++ b/torch_ipex/csrc/init_python_bindings.cpp @@ -94,7 +94,7 @@ void InitIpexModuleBindings(py::module m) { }); m.def("linear", - [](const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias) { + [](const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias) { return AtenIpexTypeExt::linear(input, weight, bias); }); m.def("linear_fuse_relu", From 85e71702bd11d72e008d25e956bb0ff520911aef Mon Sep 17 00:00:00 2001 From: "Zhang, 
Xiaobing" Date: Tue, 26 May 2020 21:22:17 +0800 Subject: [PATCH 05/10] make rewrited max_pool2d op can be traced --- intel_pytorch_extension_py/ops/pooling.py | 18 ++++---- torch_ipex/csrc/cpu/CustomerOps.h | 54 +++++++++++++++++++++++ torch_ipex/csrc/cpu/ExtendOPs.cpp | 2 +- torch_ipex/csrc/cpu/RegisterOps.cpp | 8 ++-- 4 files changed, 67 insertions(+), 15 deletions(-) diff --git a/intel_pytorch_extension_py/ops/pooling.py b/intel_pytorch_extension_py/ops/pooling.py index 7ff457d56..35710b70f 100644 --- a/intel_pytorch_extension_py/ops/pooling.py +++ b/intel_pytorch_extension_py/ops/pooling.py @@ -2,7 +2,10 @@ from torch.autograd import Function import torch.nn.functional as F import _torch_ipex as core -from torch.nn.modules.utils import _single +from torch.nn.modules.utils import _single, _pair +from typing import List + +Vector = List[int] torch_adaptive_avg_pool2d = torch._C._nn.adaptive_avg_pool2d torch_max_pool2d = torch.max_pool2d @@ -49,14 +52,6 @@ def adaptive_avg_pool2d(input, output_size): pass return torch_adaptive_avg_pool2d(input, output_size) -def max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode): - try: - if input.device.type == 'dpcpp' and core.get_auto_dnnl(): - return MaxPoolingFunction.apply(input, kernel_size, stride, padding, dilation, ceil_mode) - except RuntimeError: - pass - return torch_max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode) - def max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode): try: if input.device.type == 'dpcpp' and core.get_auto_dnnl(): @@ -65,6 +60,9 @@ def max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode): pass return torch_max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode) +def max_pool2d(input, kernel_size: Vector, stride: Vector, padding: Vector, dilation: Vector, ceil_mode: bool): + return torch.ops.torch_ipex.max_pool2d(input, _pair(kernel_size), _pair(stride), _pair(padding), _pair(dilation), ceil_mode) + 
torch._C._nn.adaptive_avg_pool2d = adaptive_avg_pool2d torch.max_pool2d = max_pool2d -torch.max_pool3d = max_pool3d \ No newline at end of file +torch.max_pool3d = max_pool3d diff --git a/torch_ipex/csrc/cpu/CustomerOps.h b/torch_ipex/csrc/cpu/CustomerOps.h index ea108ec4e..96a94e7db 100644 --- a/torch_ipex/csrc/cpu/CustomerOps.h +++ b/torch_ipex/csrc/cpu/CustomerOps.h @@ -53,3 +53,57 @@ class NewLinearOp : public torch::autograd::Function { return {grad_input, grad_weight, grad_bias}; } }; + +class NewMaxPoolingOp : public torch::autograd::Function { + public: + static at::Tensor forward( + torch::autograd::AutogradContext* ctx, + at::Tensor input, + at::IntArrayRef kernel_size, + at::IntArrayRef stride, + at::IntArrayRef padding, + at::IntArrayRef dilation, + bool ceil_mode) { + ctx->saved_data["kernel_size"] = kernel_size; + ctx->saved_data["stride"] = stride; + ctx->saved_data["padding"] = padding; + ctx->saved_data["dilation"] = dilation; + ctx->saved_data["ceil_mode"] = ceil_mode; + if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { + at::Tensor output = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling(input, kernel_size, stride, + padding, dilation, ceil_mode); + ctx->save_for_backward({input, output}); + return output; + } else { + at::Tensor output, indices; + std::tie(output, indices) = at::max_pool2d_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode); + ctx->save_for_backward({input, indices}); + return output; + } + } + + static torch::autograd::tensor_list backward( + torch::autograd::AutogradContext* ctx, + torch::autograd::tensor_list grad_outputs) { + auto saved = ctx->get_saved_variables(); + at::Tensor input = saved[0]; + at::Tensor indices = saved[1]; + + at::Tensor grad_output = grad_outputs[0]; + at::Tensor grad_input; + at::IntArrayRef kernel_size = at::IntArrayRef(ctx->saved_data["kernel_size"].toIntVector()); + at::IntArrayRef stride = 
at::IntArrayRef(ctx->saved_data["stride"].toIntVector()); + at::IntArrayRef padding = at::IntArrayRef(ctx->saved_data["padding"].toIntVector()); + at::IntArrayRef dilation = at::IntArrayRef(ctx->saved_data["dilation"].toIntVector()); + bool ceil_mode = ctx->saved_data["ceil_mode"].toBool(); + + if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { + grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling_backward( + grad_output, indices, input, kernel_size, stride, padding, dilation, ceil_mode); + } else { + grad_input = at::max_pool2d_with_indices_backward(grad_output, input, kernel_size, + stride, padding, dilation, ceil_mode, indices); + } + return {grad_input}; + } +}; diff --git a/torch_ipex/csrc/cpu/ExtendOPs.cpp b/torch_ipex/csrc/cpu/ExtendOPs.cpp index 80d337cc3..ed87fb9a5 100644 --- a/torch_ipex/csrc/cpu/ExtendOPs.cpp +++ b/torch_ipex/csrc/cpu/ExtendOPs.cpp @@ -474,7 +474,7 @@ at::Tensor AtenIpexTypeExt::adaptive_avg_pool2d_backward(const at::Tensor& grad_ } at::Tensor AtenIpexTypeExt::max_pooling(const at::Tensor& input, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) { - return cpu::AtenIpexCPUDev::dil_max_pooling(input, kernel_size, stride, padding, dilation, ceil_mode); + return NewMaxPoolingOp::apply(input, kernel_size, stride, padding, dilation, ceil_mode); } at::Tensor AtenIpexTypeExt::max_pooling_backward(const at::Tensor& grad_output, const at::Tensor& output, const at::Tensor& input, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) { diff --git a/torch_ipex/csrc/cpu/RegisterOps.cpp b/torch_ipex/csrc/cpu/RegisterOps.cpp index 16e017c8b..f241fe341 100644 --- a/torch_ipex/csrc/cpu/RegisterOps.cpp +++ b/torch_ipex/csrc/cpu/RegisterOps.cpp @@ -3,9 +3,9 @@ static auto registry = torch::RegisterOperators() - .op("torch_ipex::linear", - [](const at::Tensor& input, const 
at::Tensor& weight, const at::Tensor& bias) { - return torch_ipex::AtenIpexTypeExt::linear(input, weight, bias); + .op("torch_ipex::linear", &torch_ipex::AtenIpexTypeExt::linear) + .op("torch_ipex::max_pool2d", [](const at::Tensor& self, c10::List kernel_size, + c10::List stride, c10::List padding, c10::List dilation, bool ceil_mode=false){ + return torch_ipex::AtenIpexTypeExt::max_pooling(self, kernel_size.vec(), stride.vec(), padding.vec(), dilation.vec(), ceil_mode); }); - From 2cca38f91f01ce3af535dd7f563935c2d2d85232 Mon Sep 17 00:00:00 2001 From: "Zhang, Xiaobing" Date: Wed, 27 May 2020 10:00:24 +0800 Subject: [PATCH 06/10] fix max_pool2d backward floating point exception issue --- .../csrc/cpu/{CustomerOps.h => CustomOPs.h} | 20 +++++++++---------- torch_ipex/csrc/cpu/ExtendOPs.cpp | 2 +- torch_ipex/csrc/cpu/FusionOPs.cpp | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) rename torch_ipex/csrc/cpu/{CustomerOps.h => CustomOPs.h} (86%) diff --git a/torch_ipex/csrc/cpu/CustomerOps.h b/torch_ipex/csrc/cpu/CustomOPs.h similarity index 86% rename from torch_ipex/csrc/cpu/CustomerOps.h rename to torch_ipex/csrc/cpu/CustomOPs.h index 96a94e7db..a989d6d60 100644 --- a/torch_ipex/csrc/cpu/CustomerOps.h +++ b/torch_ipex/csrc/cpu/CustomOPs.h @@ -9,8 +9,6 @@ #include "torch_ipex/csrc/utils.h" #include "DevOPs.h" -using namespace at; - class NewLinearOp : public torch::autograd::Function { public: static at::Tensor forward( @@ -40,9 +38,9 @@ class NewLinearOp : public torch::autograd::Function { if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_input( - input.sizes(), grad_output, weight); + input.sizes(), grad_output.contiguous(), weight); std::tie(grad_weight, grad_bias) = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_weights( - grad_output, input, weight, bias.defined()); + grad_output.contiguous(), input, weight, bias.defined()); } else { auto 
grad_input = grad_output.mm(weight); auto grad_weight = grad_output.t().mm(input); @@ -69,6 +67,7 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> { ctx->saved_data["padding"] = padding; ctx->saved_data["dilation"] = dilation; ctx->saved_data["ceil_mode"] = ceil_mode; + if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { at::Tensor output = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling(input, kernel_size, stride, padding, dilation, ceil_mode); @@ -89,12 +88,13 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> { at::Tensor input = saved[0]; at::Tensor indices = saved[1]; - at::Tensor grad_output = grad_outputs[0]; + at::Tensor grad_output = grad_outputs[0].contiguous(); at::Tensor grad_input; - at::IntArrayRef kernel_size = at::IntArrayRef(ctx->saved_data["kernel_size"].toIntVector()); - at::IntArrayRef stride = at::IntArrayRef(ctx->saved_data["stride"].toIntVector()); - at::IntArrayRef padding = at::IntArrayRef(ctx->saved_data["padding"].toIntVector()); - at::IntArrayRef dilation = at::IntArrayRef(ctx->saved_data["dilation"].toIntVector()); + + std::vector<int64_t> kernel_size = ctx->saved_data["kernel_size"].toIntVector(); + std::vector<int64_t> stride = ctx->saved_data["stride"].toIntVector(); + std::vector<int64_t> padding = ctx->saved_data["padding"].toIntVector(); + std::vector<int64_t> dilation = ctx->saved_data["dilation"].toIntVector(); bool ceil_mode = ctx->saved_data["ceil_mode"].toBool(); if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { @@ -104,6 +104,6 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> { grad_input = at::max_pool2d_with_indices_backward(grad_output, input, kernel_size, stride, padding, dilation, ceil_mode, indices); } - return {grad_input}; + return {grad_input, at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor()}; } }; diff --git a/torch_ipex/csrc/cpu/ExtendOPs.cpp b/torch_ipex/csrc/cpu/ExtendOPs.cpp index ed87fb9a5..fbcb0281b 100644 ---
a/torch_ipex/csrc/cpu/ExtendOPs.cpp +++ b/torch_ipex/csrc/cpu/ExtendOPs.cpp @@ -10,7 +10,7 @@ #include "xsmm/libxsmm_utils.h" #include "../utils.h" #include "DevOPs.h" -#include "CustomerOps.h" +#include "CustomOps.h" namespace torch_ipex { diff --git a/torch_ipex/csrc/cpu/FusionOPs.cpp b/torch_ipex/csrc/cpu/FusionOPs.cpp index 2e3e1b9c5..d9fec98fa 100644 --- a/torch_ipex/csrc/cpu/FusionOPs.cpp +++ b/torch_ipex/csrc/cpu/FusionOPs.cpp @@ -52,7 +52,7 @@ at::Tensor AtenIpexJITDev::dil_convolution_relu( groups, dil::attr_t::fuse_relu()); - return dbl::comm::gen_aten_tensor_by(dil_output); + return dbl::comm::gen_aten_tensor_by(std::move(dil_output)); } static at::Tensor& dil_convolution_inplace_fusion( From 2cfb394eb5036cd8686ad1dabe2768b6eb7b70ae Mon Sep 17 00:00:00 2001 From: "Zhang, Xiaobing" Date: Wed, 27 May 2020 16:26:52 +0800 Subject: [PATCH 07/10] make the rewritten AdaptiveAvgPool2d op traceable --- intel_pytorch_extension_py/ops/pooling.py | 25 ++-------------- torch_ipex/csrc/cpu/CustomOPs.h | 35 +++++++++++++++++++++++ torch_ipex/csrc/cpu/ExtendOPs.cpp | 4 +-- torch_ipex/csrc/cpu/RegisterOps.cpp | 3 ++ 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/intel_pytorch_extension_py/ops/pooling.py b/intel_pytorch_extension_py/ops/pooling.py index 35710b70f..64ce169de 100644 --- a/intel_pytorch_extension_py/ops/pooling.py +++ b/intel_pytorch_extension_py/ops/pooling.py @@ -7,24 +7,8 @@ Vector = List[int] -torch_adaptive_avg_pool2d = torch._C._nn.adaptive_avg_pool2d -torch_max_pool2d = torch.max_pool2d torch_max_pool3d = torch.max_pool3d -class AdaptiveAvgPool2dFunction(Function): - @staticmethod - def forward(ctx, input, output_size): - output = core.adaptive_avg_pool2d(input, _single(output_size)) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - (input,) = ctx.saved_tensors - grad_output = grad_output.contiguous() - grad_input = core.adaptive_avg_pool2d_backward(grad_output, input) - return
(grad_input, None) - class MaxPoolingFunction(Function): @staticmethod def forward(ctx, input, kernel_size, stride, padding, dilation, ceil_mode): @@ -44,13 +28,8 @@ def backward(ctx, grad_output): grad_input = core.max_pooling_backward(grad_output, output, input, ctx.kernel_size, ctx.stride, ctx.padding, ctx.dilation, ctx.ceil_mode) return (grad_input, None, None, None, None, None) -def adaptive_avg_pool2d(input, output_size): - try: - if input.device.type == 'dpcpp' and core.get_auto_dnnl(): - return AdaptiveAvgPool2dFunction.apply(input, output_size) - except RuntimeError: - pass - return torch_adaptive_avg_pool2d(input, output_size) +def adaptive_avg_pool2d(input, output_size: Vector): + return torch.ops.torch_ipex.adaptive_avg_pool2d(input, _pair(output_size)) def max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode): try: diff --git a/torch_ipex/csrc/cpu/CustomOPs.h b/torch_ipex/csrc/cpu/CustomOPs.h index a989d6d60..3d681e564 100644 --- a/torch_ipex/csrc/cpu/CustomOPs.h +++ b/torch_ipex/csrc/cpu/CustomOPs.h @@ -107,3 +107,38 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> { return {grad_input, at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor()}; } }; + +class NewApaptiveAvgPoolingOp : public torch::autograd::Function<NewApaptiveAvgPoolingOp> { + public: + static at::Tensor forward( + torch::autograd::AutogradContext* ctx, + at::Tensor input, + at::IntArrayRef output_size) { + ctx->save_for_backward({input}); + + at::Tensor output; + if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { + output = torch_ipex::cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d(input, output_size); + } else { + output = at::_adaptive_avg_pool2d(input, output_size); + } + return output; + } + + static torch::autograd::tensor_list backward( + torch::autograd::AutogradContext* ctx, + torch::autograd::tensor_list grad_outputs) { + auto saved = ctx->get_saved_variables(); + at::Tensor input = saved[0]; + + at::Tensor grad_output =
grad_outputs[0].contiguous(); + at::Tensor grad_input; + + if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { + grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d_backward(grad_output, input); + } else { + grad_input = at::_adaptive_avg_pool2d_backward(grad_output, input); + } + return {grad_input, at::Tensor()}; + } +}; diff --git a/torch_ipex/csrc/cpu/ExtendOPs.cpp b/torch_ipex/csrc/cpu/ExtendOPs.cpp index fbcb0281b..a0cacd084 100644 --- a/torch_ipex/csrc/cpu/ExtendOPs.cpp +++ b/torch_ipex/csrc/cpu/ExtendOPs.cpp @@ -10,7 +10,7 @@ #include "xsmm/libxsmm_utils.h" #include "../utils.h" #include "DevOPs.h" -#include "CustomOps.h" +#include "CustomOPs.h" namespace torch_ipex { @@ -466,7 +466,7 @@ std::tuple AtenIpexTypeExt::linear_backward( } at::Tensor AtenIpexTypeExt::adaptive_avg_pool2d(at::Tensor const& input, at::IntArrayRef output_size) { - return cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d(input, output_size); + return NewApaptiveAvgPoolingOp::apply(input, output_size); } at::Tensor AtenIpexTypeExt::adaptive_avg_pool2d_backward(const at::Tensor& grad_output, const at::Tensor& input) { diff --git a/torch_ipex/csrc/cpu/RegisterOps.cpp b/torch_ipex/csrc/cpu/RegisterOps.cpp index f241fe341..694d0b9de 100644 --- a/torch_ipex/csrc/cpu/RegisterOps.cpp +++ b/torch_ipex/csrc/cpu/RegisterOps.cpp @@ -7,5 +7,8 @@ static auto registry = .op("torch_ipex::max_pool2d", [](const at::Tensor& self, c10::List<int64_t> kernel_size, c10::List<int64_t> stride, c10::List<int64_t> padding, c10::List<int64_t> dilation, bool ceil_mode=false){ return torch_ipex::AtenIpexTypeExt::max_pooling(self, kernel_size.vec(), stride.vec(), padding.vec(), dilation.vec(), ceil_mode); + }) + .op("torch_ipex::adaptive_avg_pool2d", [](const at::Tensor&self, c10::List<int64_t> output_size) { + return torch_ipex::AtenIpexTypeExt::adaptive_avg_pool2d(self, output_size.vec()); }); From f402c30fa6ceb00be3e6c8a138a62568dda688e7 Mon Sep 17 00:00:00 2001 From: "Zhang, Xiaobing" Date: Thu, 28 May 2020
17:09:03 +0800 Subject: [PATCH 08/10] fix linear issue when bias is None --- intel_pytorch_extension_py/ops/linear.py | 8 +++++++- torch_ipex/csrc/cpu/CustomOPs.h | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/intel_pytorch_extension_py/ops/linear.py b/intel_pytorch_extension_py/ops/linear.py index 8ec9e76c9..ab9a5480e 100644 --- a/intel_pytorch_extension_py/ops/linear.py +++ b/intel_pytorch_extension_py/ops/linear.py @@ -2,5 +2,11 @@ from torch.autograd import Function import torch.nn.functional as F import _torch_ipex as core +from typing import Optional -F.linear = torch.ops.torch_ipex.linear +def linear(input, weight, bias: Optional[torch.Tensor] = None): + if bias is None: + bias = torch.zeros(weight.size(0), dtype=weight.dtype, device=weight.device) + return torch.ops.torch_ipex.linear(input, weight, bias) + +F.linear = linear diff --git a/torch_ipex/csrc/cpu/CustomOPs.h b/torch_ipex/csrc/cpu/CustomOPs.h index 3d681e564..2cea2ad05 100644 --- a/torch_ipex/csrc/cpu/CustomOPs.h +++ b/torch_ipex/csrc/cpu/CustomOPs.h @@ -15,7 +15,7 @@ class NewLinearOp : public torch::autograd::Function<NewLinearOp> { torch::autograd::AutogradContext* ctx, at::Tensor input, at::Tensor weight, - at::Tensor bias) { + at::Tensor bias = at::Tensor()) { ctx->save_for_backward({input, weight, bias}); if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) { return torch_ipex::cpu::AtenIpexCPUDev::dil_linear(input, weight, bias); @@ -42,8 +42,8 @@ class NewLinearOp : public torch::autograd::Function<NewLinearOp> { std::tie(grad_weight, grad_bias) = torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_weights( grad_output.contiguous(), input, weight, bias.defined()); } else { - auto grad_input = grad_output.mm(weight); - auto grad_weight = grad_output.t().mm(input); + grad_input = grad_output.mm(weight); + grad_weight = grad_output.t().mm(input); if (bias.defined()) { grad_bias = grad_output.sum(0); } From 746fe44f9188c84a13cc6950004f1d1b926fff0b Mon Sep 17 00:00:00 2001 From: "Zhang, Xiaobing"
Date: Thu, 28 May 2020 19:01:57 +0800 Subject: [PATCH 09/10] fix max_pool2d issue with stride=None case --- intel_pytorch_extension_py/ops/pooling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/intel_pytorch_extension_py/ops/pooling.py b/intel_pytorch_extension_py/ops/pooling.py index 64ce169de..12114a91f 100644 --- a/intel_pytorch_extension_py/ops/pooling.py +++ b/intel_pytorch_extension_py/ops/pooling.py @@ -40,6 +40,8 @@ def max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode): return torch_max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode) def max_pool2d(input, kernel_size: Vector, stride: Vector, padding: Vector, dilation: Vector, ceil_mode: bool): + if not stride: + stride = kernel_size return torch.ops.torch_ipex.max_pool2d(input, _pair(kernel_size), _pair(stride), _pair(padding), _pair(dilation), ceil_mode) torch._C._nn.adaptive_avg_pool2d = adaptive_avg_pool2d From 0fb1060be8819e3a875fe6433f1a56898af71f65 Mon Sep 17 00:00:00 2001 From: "Zhang, Xiaobing" Date: Thu, 28 May 2020 19:40:51 +0800 Subject: [PATCH 10/10] add prepack_weight API --- torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp index bcfe69c27..2d5102b77 100644 --- a/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp +++ b/torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp @@ -100,6 +100,19 @@ RegisterOperators op({ } }, aliasAnalysisFromSchema() + ), + Operator( + "ipex::prepack_weight(Tensor input, Tensor weight, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, int groups) -> Tensor(a!)", + [] (const Node* node) ->Operation { + if (torch_ipex::check_auto_dnnl()) { + return [] (Stack& stack) { + return 0; + }; + } else { + TORCH_CHECK(false, "PyTorch native path not support prepack weight now"); + } + }, + aliasAnalysisFromSchema() ) }); }