Commit 68fb1cf

refine binary ops

- support resizing behavior of the out= parameter
- fall back to the ATen implementation on broadcastable inputs
- remove the broadcast workaround in jit

1 parent 89c4149 · commit 68fb1cf
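A minimal sketch of the two user-visible behaviors this commit targets, mirroring the new tests in test_lazy_reorder.py below. The import name intel_pytorch_extension and the ipex.DEVICE attribute are assumptions taken from the repo's test suite:

    import torch
    import intel_pytorch_extension as ipex  # import name assumed from the test suite

    x = torch.ones(2, 3, 4, 5).to(ipex.DEVICE)
    y = torch.ones(2, 3, 4, 5).to(ipex.DEVICE)

    # 1) an out= tensor with a mismatching shape is resized to the result shape
    out = torch.ones(1).to(ipex.DEVICE)
    torch.add(x, y, out=out)
    assert out.shape == x.shape

    # 2) broadcastable inputs (e.g. a scalar tensor) no longer need the jit
    #    workaround; the op falls back to the ATen implementation
    scalar = torch.ones(1).to(ipex.DEVICE)
    z = torch.mul(x, scalar)
    assert z.shape == x.shape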

File tree

4 files changed: +136 -68 lines changed


intel_pytorch_extension_py/ops/jit_script.py

Lines changed: 0 additions & 3 deletions
@@ -13,10 +13,7 @@ def script_(obj, optimize=None, _frames_up=0, _rcb=None):
     torch.jit.script = script_

     if core.get_jit():
-        # bypass buggy broadcastable ops in dnnl during folding
-        core.disable_auto_dnnl()
         jit_m = wrap_cpp_module(torch._C._jit_pass_fold_convbn(jit_m._c))
-        core.enable_auto_dnnl()

     return jit_m


tests/cpu/test_lazy_reorder.py

Lines changed: 79 additions & 0 deletions
@@ -33,6 +33,13 @@ def get_rand_seed():
     return int(time.time() * 1000000000)

 device = ipex.DEVICE
+
+def convert_blocked(t):
+    assert t.dim() == 4, "only support converting 4d tensor"
+    c = t.size(1)
+    t = t.clone().to(device)
+    return F.conv2d(t, torch.ones(c, 1, 1, 1).to(device), groups=c)
+
 class TestConv(TestCase):
     def test_Conv2d_with_cpu(self):
         rand_seed = int(get_rand_seed())
@@ -202,6 +209,78 @@ def test_mul_(self):
         a2 = self._test_mul_('cpu', rand_seed)
         self.assertEqual(a2, a1.to('cpu'))

+    def test_mixed_format(self):
+        ipex.core.enable_auto_dnnl()
+        rand_seed = int(get_rand_seed())
+        print("{} rand seed: {}".format(sys._getframe().f_code.co_name, rand_seed))
+        torch.manual_seed(rand_seed)
+
+        shape = (2, 3, 4, 5)
+
+        for fname in ['add', 'mul']:
+
+            x_cpu = torch.ones(shape) * 5
+            y_cpu = torch.ones(shape) * 4
+
+            # block tensor is a dpcpp tensor
+            x_plain = x_cpu.clone().to(device)
+            y_plain = y_cpu.clone().to(device)
+            x_block = convert_blocked(x_cpu.clone())
+            y_block = convert_blocked(y_cpu.clone())
+
+            fn = getattr(torch, fname)
+            ref = fn(x_cpu, y_cpu)
+
+            # test add, mul
+            def test_outplace(a, b):
+                a = a.clone()
+                b = b.clone()
+                self.assertEqual(fn(a, b), ref)
+
+            test_outplace(x_plain, y_plain)
+            test_outplace(x_plain, y_block)
+            test_outplace(y_block, x_plain)
+            test_outplace(x_block, y_block)
+
+            # test add_out, mul_out
+            def test_out(a, b, o):
+                a = a.clone()
+                b = b.clone()
+                o = o.clone()
+                y = fn(a, b, out=o)
+                self.assertEqual(y, ref)
+                self.assertEqual(o, ref)
+
+            out = torch.ones(shape).to(device)
+            test_out(x_plain, y_plain, out)
+            test_out(x_plain, y_block, out)
+            test_out(y_block, x_plain, out)
+            test_out(x_block, y_block, out)
+            out = torch.ones(1).to(device)
+            test_out(x_plain, y_plain, out)
+            test_out(x_plain, y_block, out)
+            test_out(y_block, x_plain, out)
+            test_out(x_block, y_block, out)
+
+            # test add_, mul_
+            def test_inplace(a, b):
+                a = a.clone()
+                b = b.clone()
+                y = getattr(a, fname + '_')(b)
+                self.assertEqual(a, ref)
+                self.assertEqual(y, ref)
+
+            test_inplace(x_plain, y_plain)
+            test_inplace(x_plain, y_block)
+            test_inplace(y_block, x_plain)
+            test_inplace(x_block, y_block)
+
+            # test broadcast
+            scalar = torch.ones(1).to(device)
+            self.assertEqual(fn(x_plain, scalar), fn(x_cpu, scalar))
+            self.assertEqual(fn(scalar, x_plain), fn(scalar, x_cpu))
+
+
 class TestRelu(TestCase):
     def _test_relu_(self, device, rand_seed):
         torch.manual_seed(rand_seed)
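The convert_blocked helper above relies on a depthwise 1x1 convolution with all-ones weights: numerically it is an identity, but running it through DNNL leaves the output in a blocked (non-public) layout. A quick sanity sketch under the same import assumptions as the tests:

    import torch
    import torch.nn.functional as F
    import intel_pytorch_extension as ipex  # import name assumed from the test suite

    t = torch.randn(2, 3, 4, 5)
    # depthwise 1x1 conv with weights of ones: identity in values,
    # blocked in layout (one group per channel, c == 3 here)
    blocked = F.conv2d(t.clone().to(ipex.DEVICE),
                       torch.ones(3, 1, 1, 1).to(ipex.DEVICE), groups=3)
    assert torch.allclose(blocked.to('cpu'), t)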

torch_ipex/csrc/cpu/DevOPs.cpp

Lines changed: 38 additions & 58 deletions
@@ -254,115 +254,95 @@ std::tuple<at::Tensor,at::Tensor,at::Tensor> AtenIpexCPUDev::mkldnn_convolution_
   return std::tuple<at::Tensor,at::Tensor,at::Tensor>(bridge::shallowUpgradeToDPCPPTensor(std::get<0>(_ipex_result)), bridge::shallowUpgradeToDPCPPTensor(std::get<1>(_ipex_result)), bridge::shallowUpgradeToDPCPPTensor(std::get<2>(_ipex_result)));
 }

-at::Tensor& AtenIpexCPUDev::dil_add_out(
+template<bool inplace>
+at::Tensor& dil_add_common(
     at::Tensor& result,
     const at::Tensor& self,
     const at::Tensor& other,
     at::Scalar alpha) {
-  DEBUG("AtenIpexCPUDev::dil_add_out\n");
   CHECK_DNNL_OP_PRE_COND(self);
   CHECK_DNNL_OP_PRE_COND(other);

   TORCH_CHECK(self.sizes().equals(other.sizes()),
-      "dil_add not support broadcast yet");
-  auto inferred_size = self.sizes();
-  if (!result.sizes().equals(inferred_size)) {
-    result.resize_(inferred_size);
-  }
+      "dil add not support broadcast yet");

   dbl::comm::reorder_to_bf16_for_mix_prec(self);
   dbl::comm::reorder_to_bf16_for_mix_prec(other);
-  dbl::comm::reorder_to_bf16_for_mix_prec(result);

   auto x = dbl::comm::try_gen_dil_tensor(self);
   auto y = dbl::comm::try_gen_dil_tensor(other);
-  auto z = dbl::comm::try_gen_dil_tensor(result);
+  auto z = inplace ? x : dil::tensor();

   dil::sum::compute({1.0, alpha.to<float>()}, {x, y}, z);

-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(z.is_public_format() || check_tensor_own_whole_storage(result));
-  dbl::comm::sync_shape_from_dil_to_aten(result, z);
+  if (!inplace) {
+    dbl::comm::equip_dil_buffer(result, z);
+  }
   return result;
 }

-at::Tensor AtenIpexCPUDev::dil_add(const at::Tensor& self, const at::Tensor& other, at::Scalar alpha) {
-  DEBUG("AtenIpexCPUDev::dil_add\n");
-  CHECK_DNNL_OP_PRE_COND(self);
-  CHECK_DNNL_OP_PRE_COND(other);
-
-  TORCH_CHECK(self.sizes().equals(other.sizes()),
-      "dil_add not support broadcast yet");
-
-  dbl::comm::reorder_to_bf16_for_mix_prec(self);
-  dbl::comm::reorder_to_bf16_for_mix_prec(other);
+at::Tensor& AtenIpexCPUDev::dil_add_out(at::Tensor& result, const at::Tensor& self, const at::Tensor& other, at::Scalar alpha) {
+  DEBUG("AtenIpexCPUDev::dil_add_out\n");

-  auto x = dbl::comm::try_gen_dil_tensor(self);
-  auto y = dbl::comm::try_gen_dil_tensor(other);
-  dil::tensor z;
+  return dil_add_common</*inplace=*/false>(result, self, other, alpha);
+}

-  dil::sum::compute({1.0, alpha.to<float>()}, {x, y}, z);
+at::Tensor AtenIpexCPUDev::dil_add(const at::Tensor& self, const at::Tensor& other, at::Scalar alpha) {
+  DEBUG("AtenIpexCPUDev::dil_add\n");

-  return dbl::comm::gen_aten_tensor_by(std::move(z));
+  auto result = dbl::comm::empty_dil_tensor({0}, self.options());
+  return dil_add_common</*inplace=*/false>(result, self, other, alpha);
 }

 at::Tensor & AtenIpexCPUDev::dil_add_(at::Tensor& self, const at::Tensor& other, at::Scalar alpha) {
   DEBUG("AtenIpexCPUDev::dil_add_\n");

-  return dil_add_out(self, self, other, alpha);
+  return dil_add_common</*inplace=*/true>(self, self, other, alpha);
 }

-at::Tensor& AtenIpexCPUDev::dil_mul_out(at::Tensor& result, const at::Tensor& self, const at::Tensor& other) {
-  DEBUG("AtenIpexCPUDev::dil_mul_out\n");
-  CHECK_DNNL_OP_PRE_COND(result);
+template<bool inplace>
+at::Tensor& dil_mul_common(
+    at::Tensor& result,
+    const at::Tensor& self,
+    const at::Tensor& other) {
   CHECK_DNNL_OP_PRE_COND(self);
   CHECK_DNNL_OP_PRE_COND(other);

   TORCH_CHECK(self.sizes().equals(other.sizes()),
-      "dil_mul not support broadcast yet");
-  auto inferred_size = self.sizes();
-  if (!result.sizes().equals(inferred_size)) {
-    result.resize_(inferred_size);
-  }
+      "dil mul not support broadcast yet");

   dbl::comm::reorder_to_bf16_for_mix_prec(self);
   dbl::comm::reorder_to_bf16_for_mix_prec(other);
-  dbl::comm::reorder_to_bf16_for_mix_prec(result);

-  auto dil_result = dbl::comm::try_gen_dil_tensor(result);
-  auto dil_self = dbl::comm::try_gen_dil_tensor(self);
-  auto dil_other = dbl::comm::try_gen_dil_tensor(other);
+  auto x = dbl::comm::try_gen_dil_tensor(self);
+  auto y = dbl::comm::try_gen_dil_tensor(other);
+  auto z = inplace ? x : dil::tensor();

-  dil::binary::compute(dil_self, dil_other, dil_result, dil::algorithm::binary_mul);
+  dil::binary::compute(x, y, z, dil::algorithm::binary_mul);

-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dil_result.is_public_format() || check_tensor_own_whole_storage(result));
-  dbl::comm::sync_shape_from_dil_to_aten(result, dil_result);
+  if (!inplace) {
+    dbl::comm::equip_dil_buffer(result, z);
+  }
   return result;
 }

-at::Tensor AtenIpexCPUDev::dil_mul(const at::Tensor& self, const at::Tensor& other) {
-  DEBUG("AtenIpexCPUDev::dil_mul\n");
-  CHECK_DNNL_OP_PRE_COND(self);
-  CHECK_DNNL_OP_PRE_COND(other);
-
-  TORCH_CHECK(self.sizes().equals(other.sizes()),
-      "dil_mul not support broadcast yet");
-
-  dbl::comm::reorder_to_bf16_for_mix_prec(self);
-  dbl::comm::reorder_to_bf16_for_mix_prec(other);
+at::Tensor& AtenIpexCPUDev::dil_mul_out(at::Tensor& result, const at::Tensor& self, const at::Tensor& other) {
+  DEBUG("AtenIpexCPUDev::dil_mul_out\n");

-  auto x = dbl::comm::try_gen_dil_tensor(self);
-  auto y = dbl::comm::try_gen_dil_tensor(other);
-  dil::tensor z;
+  return dil_mul_common</*inplace=*/false>(result, self, other);
+}

-  dil::binary::compute(x, y, z, dil::algorithm::binary_mul);
+at::Tensor AtenIpexCPUDev::dil_mul(const at::Tensor& self, const at::Tensor& other) {
+  DEBUG("AtenIpexCPUDev::dil_mul\n");

-  return dbl::comm::gen_aten_tensor_by(std::move(z));
+  auto result = dbl::comm::empty_dil_tensor({0}, self.options());
+  return dil_mul_common</*inplace=*/false>(result, self, other);
 }

 at::Tensor& AtenIpexCPUDev::dil_mul_(at::Tensor& self, const at::Tensor& other) {
   DEBUG("AtenIpexCPUDev::dil_mul_\n");

-  return dil_mul_out(self, self, other);
+  return dil_mul_common</*inplace=*/true>(self, self, other);
 }

 void matmul_common(
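With this refactor, dil_add, dil_add_out, and dil_add_ (and the mul counterparts) all funnel into one dil_*_common<inplace> helper, so the three PyTorch entry points should agree on the result. A hedged usage sketch, under the same intel_pytorch_extension import assumption as above:

    import torch
    import intel_pytorch_extension as ipex  # import name assumed from the test suite

    a = torch.full((2, 3, 4, 5), 5.).to(ipex.DEVICE)
    b = torch.full((2, 3, 4, 5), 4.).to(ipex.DEVICE)
    ref = torch.full((2, 3, 4, 5), 9.)

    c = torch.add(a, b)                 # out-of-place -> dil_add
    o = torch.ones(1).to(ipex.DEVICE)
    torch.add(a, b, out=o)              # out= variant -> dil_add_out (resizes o)
    a.add_(b)                           # in-place -> dil_add_ (inplace=true)
    for t in (c, o, a):
        assert torch.equal(t.to('cpu'), ref)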

torch_ipex/csrc/cpu/dbl/Common.cpp

Lines changed: 19 additions & 7 deletions
@@ -89,6 +89,15 @@ void reorder_to_desc(const at::Tensor& tensor, const dil::tensor::desc& expected
 }

 void equip_dil_buffer(const at::Tensor& tensor, dil::tensor dil_tensor_buffer) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      tensor.device().is_dpcpp(),
+      "dil buffer can only be equipped to dpcpp tensor");
+
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      check_tensor_own_whole_storage(tensor),
+      "dil buffer can only be equipped to tensors that own the whole storage, "
+      "as dil buffer is going to replace the original storage");
+
   // Build new shade data context
   cpu::ShadeDataContext *new_shade_data_context = cpu::ShadeDataContext::allocShadeDataContext();
   new_shade_data_context->data_type = cpu::SHADE_DATA_TYPE::DIL;
@@ -97,13 +106,10 @@ void equip_dil_buffer(const at::Tensor& tensor, dil::tensor dil_tensor_buffer) {
   void *tensor_data = nullptr;
   if (dil_tensor_buffer.get_data_type() != get_dil_data_type(tensor.scalar_type())) {
     new_shade_data_context->mix_prec_type = cpu::MIX_PREC_TYPE::MIX_BF16_FP32;
-  } else {
-    if (dil_tensor_buffer.is_public_format()) {
-      tensor_data = dil_tensor_buffer.get_data_handle();
-      new_shade_data_context->cpu_raw_data = tensor_data;
-      new_shade_data_context->cpu_del_fun = &(c10::detail::deleteNothing);
-      sync_shape_from_dil_to_aten(tensor, dil_tensor_buffer);
-    }
+  } else if (dil_tensor_buffer.is_public_format()) {
+    tensor_data = dil_tensor_buffer.get_data_handle();
+    new_shade_data_context->cpu_raw_data = tensor_data;
+    new_shade_data_context->cpu_del_fun = &(c10::detail::deleteNothing);
   }

   // Create a new DataPtr instances because the DataPtr class does not support set
@@ -116,6 +122,12 @@ void equip_dil_buffer(const at::Tensor& tensor, dil::tensor dil_tensor_buffer) {

   IPEXTensorImpl* ipex_tensor_impl = (IPEXTensorImpl *)tensor.unsafeGetTensorImpl();
   ipex_tensor_impl->storage().set_data_ptr(std::move(shade_data_ptr));
+
+  // After equip_dil_buffer(), whole storage should be managed by dil tensor,
+  // and thus storage metadata should be overwritten by dil tensor
+  // Note: Storage::set_numel() might be removed later
+  ipex_tensor_impl->storage().set_numel(dil_tensor_buffer.get_nelems());
+  cpu::dbl::comm::sync_shape_from_dil_to_aten(tensor, dil_tensor_buffer);
 }

 dil::tensor try_gen_dil_tensor(const at::Tensor &input) {
