Commit 6b835d2

fix *mm ops and resizing behavior
1 parent e24f42d commit 6b835d2

File tree: 2 files changed, 90 additions & 113 deletions
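A note on the "resizing behavior" in the commit title: PyTorch's *mm ops are expected to resize an out= tensor to the inferred result shape rather than write through a stale buffer, and the DNNL paths below now follow the same contract. A minimal sketch of the stock PyTorch semantics (hypothetical shapes, plain CPU tensors):

    import torch

    mat1 = torch.randn(4, 3)
    mat2 = torch.randn(3, 5)

    # An out= tensor of the wrong shape is resized to the inferred
    # (4, 5) result shape instead of being written through as-is.
    out = torch.empty(0)
    torch.mm(mat1, mat2, out=out)
    assert out.shape == (4, 5)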

tests/cpu/test_lazy_reorder.py

Lines changed: 12 additions & 0 deletions

@@ -467,6 +467,11 @@ def test_addmm(self):
         torch.addmm(input=res_dpcpp, mat1=b1_dpcpp, mat2=b2_dpcpp, alpha=alpha, beta=beta, out=y_dpcpp)
         self.assertEqual(y_cpu, y_dpcpp)
 
+        res_cpu.addmm_(mat1=b1_cpu, mat2=b2_cpu, alpha=alpha, beta=beta)
+        res_dpcpp.addmm_(mat1=b1_cpu, mat2=b2_cpu, alpha=alpha, beta=beta)
+        self.assertEqual(res_cpu, res_dpcpp)
+
+
     def test_addbmm(self):
         ipex.enable_auto_dnnl()
         rand_seed = int(get_rand_seed())
@@ -494,6 +499,10 @@ def test_addbmm(self):
         torch.addbmm(res_dpcpp, b1_dpcpp, b2_dpcpp, beta=beta, alpha=alpha, out=y_dpcpp)
         self.assertEqual(y_cpu, y_dpcpp, 1e-4)
 
+        res_cpu.addbmm_(b1_cpu, b2_cpu, beta=beta, alpha=alpha)
+        res_dpcpp.addbmm_(b1_dpcpp, b2_dpcpp, beta=beta, alpha=alpha)
+        self.assertEqual(res_cpu, res_dpcpp, 1e-4)
+
     def test_baddbmm(self):
         ipex.enable_auto_dnnl()
         rand_seed = int(get_rand_seed())
@@ -520,6 +529,9 @@ def test_baddbmm(self):
         torch.baddbmm(res_cpu, b1_cpu, b2_cpu, alpha=alpha, beta=beta, out=y_cpu),
         torch.baddbmm(res_dpcpp, b1_dpcpp, b2_dpcpp, alpha=alpha, beta=beta, out=y_dpcpp),
         self.assertEqual(y_cpu, y_dpcpp)
+        res_cpu.baddbmm_(b1_cpu, b2_cpu, alpha=alpha, beta=beta)
+        res_dpcpp.baddbmm_(b1_cpu, b2_cpu, alpha=alpha, beta=beta)
+        self.assertEqual(res_cpu, res_dpcpp)
 
 class TestLinear(TestCase):
     def test_linear(self):
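The new assertions exercise the in-place variants, which previously went through the broken resizing path. As a standalone reminder of the semantics under test (a minimal sketch with made-up shapes, independent of the ipex fixtures; addmm_, addbmm_, and baddbmm_ follow the standard PyTorch definitions):

    import torch

    b, n, m, p = 2, 3, 4, 5
    res = torch.randn(n, p)
    mat1 = torch.randn(n, m)
    mat2 = torch.randn(m, p)

    # addmm_: res <- beta * res + alpha * (mat1 @ mat2), in place.
    expected = 0.5 * res + 2.0 * (mat1 @ mat2)
    res.addmm_(mat1=mat1, mat2=mat2, beta=0.5, alpha=2.0)
    assert torch.allclose(res, expected, atol=1e-6)

    # addbmm_ folds the batch dimension into a 2D result:
    # res2 <- beta * res2 + alpha * sum_i(b1[i] @ b2[i]).
    res2 = torch.randn(n, p)
    b1 = torch.randn(b, n, m)
    b2 = torch.randn(b, m, p)
    expected2 = res2 + (b1 @ b2).sum(dim=0)
    res2.addbmm_(b1, b2)
    assert torch.allclose(res2, expected2, atol=1e-6)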

torch_ipex/csrc/cpu/DevOPs.cpp

Lines changed: 78 additions & 113 deletions

@@ -370,99 +370,77 @@ void matmul_common(
       dil::scale_t(), dil::scale_t(), dil::scale_t(), attr);
 }
 
-at::Tensor AtenIpexCPUDev::dil_bmm(
-    const at::Tensor& self,
-    const at::Tensor& mat2) {
+at::Tensor AtenIpexCPUDev::dil_bmm(const at::Tensor& self, const at::Tensor& mat2) {
   DEBUG("AtenIpexCPUDev::dil_bmm\n");
 
-  at::Tensor result = at::empty({0}, self.options());
+  auto result = dbl::comm::empty_dil_tensor({0}, self.options());
   return dil_bmm_out(result, self, mat2);
 }
 
-at::Tensor& AtenIpexCPUDev::dil_bmm_out(
-    at::Tensor &result,
-    const at::Tensor& batch1,
-    const at::Tensor& batch2) {
+at::Tensor& AtenIpexCPUDev::dil_bmm_out(at::Tensor &result, const at::Tensor& batch1, const at::Tensor& batch2) {
   DEBUG("AtenIpexCPUDev::dil_bmm_out\n");
   CHECK_DNNL_OP_PRE_COND(batch1);
   CHECK_DNNL_OP_PRE_COND(batch2);
 
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batch1.dim() == 3 && batch2.dim() == 3);
-  at::IntArrayRef inferred_size{batch1.size(0), batch1.size(1), batch2.size(2)};
-  if (!result.sizes().equals(inferred_size)) {
-    result.resize_(inferred_size);
-  }
+  dil::dims inferred_size{batch1.size(0), batch1.size(1), batch2.size(2)};
 
-  dbl::comm::reorder_to_bf16_for_mix_prec(result);
   dbl::comm::reorder_to_bf16_for_mix_prec(batch1);
   dbl::comm::reorder_to_bf16_for_mix_prec(batch2);
 
   auto x = dbl::comm::try_gen_dil_tensor(batch1);
   auto w = dbl::comm::try_gen_dil_tensor(batch2);
-  auto y = dbl::comm::try_gen_dil_tensor(result);
+  dil::tensor y;
   matmul_common(x, w, dil::tensor(), y);
 
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(y.is_public_format() || check_tensor_own_whole_storage(result));
-  dbl::comm::sync_shape_from_dil_to_aten(result, y);
+  dbl::comm::equip_dil_buffer(result, y);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(inferred_size));
   return result;
 }
 
-at::Tensor AtenIpexCPUDev::dil_mm(
-    const at::Tensor& self,
-    const at::Tensor& mat2) {
+at::Tensor AtenIpexCPUDev::dil_mm(const at::Tensor& self, const at::Tensor& mat2) {
   DEBUG("AtenIpexCPUDev::dil_mm\n");
 
-  at::Tensor result = at::empty({0}, self.options());
+  auto result = dbl::comm::empty_dil_tensor({0}, self.options());
   return dil_mm_out(result, self, mat2);
 }
 
-at::Tensor& AtenIpexCPUDev::dil_mm_out(
-    at::Tensor& result,
-    const at::Tensor& self,
-    const at::Tensor& mat2) {
+at::Tensor& AtenIpexCPUDev::dil_mm_out(at::Tensor& result, const at::Tensor& self, const at::Tensor& mat2) {
   DEBUG("AtenIpexCPUDev::dil_mm_out\n");
 
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.dim() == 2 && mat2.dim() == 2);
-  at::IntArrayRef inferred_size{self.size(0), mat2.size(1)};
-  if (!result.sizes().equals(inferred_size)) {
-    result.resize_(inferred_size);
-  }
+  dil::dims inferred_size{self.size(0), mat2.size(1)};
 
-  dbl::comm::reorder_to_bf16_for_mix_prec(result);
   dbl::comm::reorder_to_bf16_for_mix_prec(self);
   dbl::comm::reorder_to_bf16_for_mix_prec(mat2);
 
   auto x = dbl::comm::try_gen_dil_tensor(self);
   auto w = dbl::comm::try_gen_dil_tensor(mat2);
-  auto y = dbl::comm::try_gen_dil_tensor(result);
+  dil::tensor y;
   matmul_common(x, w, dil::tensor(), y);
 
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(y.is_public_format() || check_tensor_own_whole_storage(result));
-  dbl::comm::sync_shape_from_dil_to_aten(result, y);
+  dbl::comm::equip_dil_buffer(result, y);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(inferred_size));
   return result;
 }
 
-at::Tensor& AtenIpexCPUDev::dil_baddbmm_out(
+template <bool inplace>
+at::Tensor& dil_baddbmm_common(
     at::Tensor &result,
     const at::Tensor& self,
     const at::Tensor& batch1,
     const at::Tensor& batch2,
     at::Scalar beta,
     at::Scalar alpha) {
-  DEBUG("AtenIpexCPUDev::dil_baddbmm_out\n");
   CHECK_DNNL_OP_PRE_COND(self);
   CHECK_DNNL_OP_PRE_COND(batch1);
   CHECK_DNNL_OP_PRE_COND(batch2);
 
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batch1.dim() == 3 && batch2.dim() == 3);
-  at::IntArrayRef inferred_size{batch1.size(0), batch1.size(1), batch2.size(2)};
-  if (!result.sizes().equals(inferred_size)) {
-    result.resize_(inferred_size);
-  }
+  dil::dims inferred_size{batch1.size(0), batch1.size(1), batch2.size(2)};
   TORCH_CHECK(self.sizes().equals(inferred_size),
-      "dil_baddbmm not support broadcast yet");
+      "dil baddbmm not support broadcast yet");
 
-  dbl::comm::reorder_to_bf16_for_mix_prec(result);
   dbl::comm::reorder_to_bf16_for_mix_prec(self);
   dbl::comm::reorder_to_bf16_for_mix_prec(batch1);
   dbl::comm::reorder_to_bf16_for_mix_prec(batch2);
@@ -478,60 +456,59 @@ at::Tensor& AtenIpexCPUDev::dil_baddbmm_out(
       bias.reshape(bias_dims);
     }
   }
-  auto y = dbl::comm::try_gen_dil_tensor(result);
+  auto y = inplace ? dbl::comm::try_gen_dil_tensor(self) : dil::tensor();
   auto attr_ = dil::attr_t::fuse_sum();
   matmul_common(x, w, bias, y, beta, alpha, attr_);
 
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(y.is_public_format() || check_tensor_own_whole_storage(result));
-  dbl::comm::sync_shape_from_dil_to_aten(result, y);
+  if (!inplace) {
+    dbl::comm::equip_dil_buffer(result, y);
+  }
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(inferred_size));
   return result;
 }
 
-at::Tensor AtenIpexCPUDev::dil_baddbmm(
+at::Tensor& AtenIpexCPUDev::dil_baddbmm_out(
+    at::Tensor &result,
     const at::Tensor& self,
     const at::Tensor& batch1,
-    const at::Tensor & batch2,
+    const at::Tensor& batch2,
     at::Scalar beta,
     at::Scalar alpha) {
+  DEBUG("AtenIpexCPUDev::dil_baddbmm_out\n");
+
+  return dil_baddbmm_common</*inplace=*/false>(result, self, batch1, batch2, beta, alpha);
+}
+
+at::Tensor AtenIpexCPUDev::dil_baddbmm(const at::Tensor& self, const at::Tensor& batch1, const at::Tensor & batch2, at::Scalar beta, at::Scalar alpha) {
   DEBUG("AtenIpexCPUDev::dil_baddbmm\n");
 
-  at::Tensor result = at::empty({0}, self.options());
-  return dil_baddbmm_out(result, self, batch1, batch2, beta, alpha);
+  auto result = dbl::comm::empty_dil_tensor({0}, self.options());
+  return dil_baddbmm_common</*inplace=*/false>(result, self, batch1, batch2, beta, alpha);
 }
 
-at::Tensor& AtenIpexCPUDev::dil_baddbmm_(
-    at::Tensor& self,
-    const at::Tensor& batch1,
-    const at::Tensor& batch2,
-    at::Scalar beta,
-    at::Scalar alpha) {
+at::Tensor& AtenIpexCPUDev::dil_baddbmm_(at::Tensor& self, const at::Tensor& batch1, const at::Tensor& batch2, at::Scalar beta, at::Scalar alpha) {
   DEBUG("AtenIpexCPUDev::dil_baddbmm_\n");
 
-  at::Tensor result = at::empty({0}, self.options());
-  return dil_baddbmm_out(self, result, batch1, batch2, beta, alpha);
+  return dil_baddbmm_out(self, self, batch1, batch2, beta, alpha);
 }
 
-at::Tensor& AtenIpexCPUDev::dil_addmm_out(
+template<bool inplace>
+at::Tensor& dil_addmm_common(
     at::Tensor& result,
     const at::Tensor& self,
     const at::Tensor& mat1,
     const at::Tensor& mat2,
     at::Scalar beta,
     at::Scalar alpha) {
-  DEBUG("AtenIpexCPUDev::dil_addmm_out\n");
   CHECK_DNNL_OP_PRE_COND(self);
   CHECK_DNNL_OP_PRE_COND(mat1);
   CHECK_DNNL_OP_PRE_COND(mat2);
 
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.dim() == 2 && mat2.dim() == 2);
-  at::IntArrayRef inferred_size{mat1.size(0), mat2.size(1)};
-  if (!result.sizes().equals(inferred_size)) {
-    result.resize_(inferred_size);
-  }
+  dil::dims inferred_size{mat1.size(0), mat2.size(1)};
   TORCH_CHECK(self.sizes().equals(inferred_size),
-      "dil_addmm not support broadcast yet");
+      "dil addmm not support broadcast yet");
 
-  dbl::comm::reorder_to_bf16_for_mix_prec(result);
   dbl::comm::reorder_to_bf16_for_mix_prec(self);
   dbl::comm::reorder_to_bf16_for_mix_prec(mat1);
   dbl::comm::reorder_to_bf16_for_mix_prec(mat2);
@@ -547,60 +524,53 @@ at::Tensor& AtenIpexCPUDev::dil_addmm_out(
       bias.reshape(bias_dims);
     }
   }
-  auto y = dbl::comm::try_gen_dil_tensor(result);
+  auto y = inplace ? dbl::comm::try_gen_dil_tensor(self) : dil::tensor();
   auto attr_ = dil::attr_t::fuse_sum();
   matmul_common(x, w, bias, y, beta, alpha, attr_);
 
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(y.is_public_format() || check_tensor_own_whole_storage(result));
-  dbl::comm::sync_shape_from_dil_to_aten(result, y);
+  if (!inplace) {
+    dbl::comm::equip_dil_buffer(result, y);
+  }
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(inferred_size));
   return result;
 }
 
-at::Tensor AtenIpexCPUDev::dil_addmm(
-    const at::Tensor& self,
-    const at::Tensor& batch1,
-    const at::Tensor & batch2,
-    at::Scalar beta,
-    at::Scalar alpha) {
+at::Tensor& AtenIpexCPUDev::dil_addmm_out(at::Tensor& result, const at::Tensor& self, const at::Tensor& mat1, const at::Tensor& mat2, at::Scalar beta, at::Scalar alpha) {
+  DEBUG("AtenIpexCPUDev::dil_addmm_out\n");
+
+  return dil_addmm_common</*inplace=*/false>(result, self, mat1, mat2, beta, alpha);
+}
+
+at::Tensor AtenIpexCPUDev::dil_addmm(const at::Tensor& self, const at::Tensor& mat1, const at::Tensor & mat2, at::Scalar beta, at::Scalar alpha) {
   DEBUG("AtenIpexCPUDev::dil_addmm\n");
 
-  at::Tensor result = at::empty({0}, self.options());
-  return dil_addmm_out(result, self, batch1, batch2, beta, alpha);
+  auto result = dbl::comm::empty_dil_tensor({0}, self.options());
+  return dil_addmm_common</*inplace=*/false>(result, self, mat1, mat2, beta, alpha);
 }
 
-at::Tensor& AtenIpexCPUDev::dil_addmm_(
-    at::Tensor& self,
-    const at::Tensor& batch1,
-    const at::Tensor & batch2,
-    at::Scalar beta,
-    at::Scalar alpha) {
+at::Tensor& AtenIpexCPUDev::dil_addmm_(at::Tensor& self, const at::Tensor& mat1, const at::Tensor & mat2, at::Scalar beta, at::Scalar alpha) {
   DEBUG("AtenIpexCPUDev::dil_addmm_\n");
 
-  at::Tensor result = at::empty({0}, self.options());
-  return dil_addmm_out(self, result, batch1, batch2, beta, alpha);
+  return dil_addmm_common</*inplace=*/false>(self, self, mat1, mat2, beta, alpha);
 }
 
-at::Tensor& AtenIpexCPUDev::dil_addbmm_out(
+template<bool inplace>
+at::Tensor& dil_addbmm_common(
    at::Tensor& result,
     const at::Tensor &self,
     const at::Tensor &batch1,
     const at::Tensor &batch2,
     at::Scalar beta,
     at::Scalar alpha) {
-  DEBUG("AtenIpexCPUDev::dil_addbmm_out\n");
   CHECK_DNNL_OP_PRE_COND(self);
   CHECK_DNNL_OP_PRE_COND(batch1);
   CHECK_DNNL_OP_PRE_COND(batch2);
 
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batch1.dim() == 3 && batch2.dim() == 3);
-  at::IntArrayRef inferred_size{batch1.size(1), batch2.size(2)};
-  if (!result.sizes().equals(inferred_size)) {
-    result.resize_(inferred_size);
-  }
+  dil::dims inferred_size{batch1.size(1), batch2.size(2)};
   TORCH_CHECK(self.sizes().equals(inferred_size),
-      "dil_addbmm not support broadcast yet");
+      "dil addbmm not support broadcast yet");
 
-  dbl::comm::reorder_to_bf16_for_mix_prec(result);
   dbl::comm::reorder_to_bf16_for_mix_prec(self);
   dbl::comm::reorder_to_bf16_for_mix_prec(batch1);
   dbl::comm::reorder_to_bf16_for_mix_prec(batch2);
@@ -616,11 +586,9 @@ at::Tensor& AtenIpexCPUDev::dil_addbmm_out(
   if (x.get_dim(0) > 1) {
     x_ = x.transpose(0, 1);
   }
-  dil::dims x_dims = {x.get_dim(1), x.get_dim(0) * x.get_dim(2)};
-  x_ = x_.reshape(x_dims);
-  dil::dims w_dims = {w.get_dim(0) * w.get_dim(1), w.get_dim(2)};
-  auto w_ = w.reshape(w_dims);
-  auto y = dbl::comm::try_gen_dil_tensor(result);
+  x_ = x_.reshape({x.get_dim(1), x.get_dim(0) * x.get_dim(2)});
+  auto w_ = w.reshape({w.get_dim(0) * w.get_dim(1), w.get_dim(2)});
+  auto y = inplace ? dbl::comm::try_gen_dil_tensor(self) : dil::tensor();
   auto attr_ = dil::attr_t::fuse_sum();
 
   dil::tensor bias;
@@ -634,33 +602,30 @@ at::Tensor& AtenIpexCPUDev::dil_addbmm_out(
   }
   matmul_common(x_, w_, bias, y, beta, alpha, attr_);
 
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(y.is_public_format() || check_tensor_own_whole_storage(result));
-  dbl::comm::sync_shape_from_dil_to_aten(result, y);
+  if (!inplace) {
+    dbl::comm::equip_dil_buffer(result, y);
+  }
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(inferred_size));
   return result;
 }
 
-at::Tensor AtenIpexCPUDev::dil_addbmm(
-    const at::Tensor &self,
-    const at::Tensor &batch1,
-    const at::Tensor &batch2,
-    at::Scalar beta,
-    at::Scalar alpha) {
+at::Tensor& AtenIpexCPUDev::dil_addbmm_out(at::Tensor& result, const at::Tensor &self, const at::Tensor &batch1, const at::Tensor &batch2, at::Scalar beta, at::Scalar alpha) {
+  DEBUG("AtenIpexCPUDev::dil_addbmm_out\n");
+
+  return dil_addbmm_common</*inplace=*/false>(result, self, batch1, batch2, beta, alpha);
+}
+
+at::Tensor AtenIpexCPUDev::dil_addbmm(const at::Tensor &self, const at::Tensor &batch1, const at::Tensor &batch2, at::Scalar beta, at::Scalar alpha) {
   DEBUG("AtenIpexCPUDev::dil_addbmm\n");
 
-  at::Tensor result = at::empty({0}, self.options());
-  return dil_addbmm_out(result, self, batch1, batch2, beta, alpha);
+  auto result = dbl::comm::empty_dil_tensor({0}, self.options());
+  return dil_addbmm_common</*inplace=*/false>(result, self, batch1, batch2, beta, alpha);
 }
 
-at::Tensor& AtenIpexCPUDev::dil_addbmm_(
-    at::Tensor& self,
-    const at::Tensor& batch1,
-    const at::Tensor& batch2,
-    at::Scalar beta,
-    at::Scalar alpha) {
+at::Tensor& AtenIpexCPUDev::dil_addbmm_(at::Tensor& self, const at::Tensor& batch1, const at::Tensor& batch2, at::Scalar beta, at::Scalar alpha) {
   DEBUG("AtenIpexCPUDev::dil_addbmm_\n");
 
-  at::Tensor result = at::empty({0}, self.options());
-  return dil_addbmm_out(self, result, batch1, batch2, beta, alpha);
+  return dil_addbmm_common</*inplace=*/true>(self, self, batch1, batch2, beta, alpha);
 }
 
 at::Tensor AtenIpexCPUDev::dil_linear(
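One detail worth calling out in dil_addbmm_common above: the reshape pair runs addbmm as a single flat GEMM by folding the batch dimension, turning batch1 of shape (b, n, m) into (n, b*m) via a transpose, and batch2 of shape (b, m, p) into (b*m, p); the flat product then equals the batch-summed matmul that addbmm computes. A quick numerical check of that identity (a plain-PyTorch sketch with made-up sizes):

    import torch

    b, n, m, p = 2, 3, 4, 5
    b1 = torch.randn(b, n, m)
    b2 = torch.randn(b, m, p)

    # Fold the batch dim: (b, n, m) -> (n, b, m) -> (n, b*m).
    x = b1.transpose(0, 1).reshape(n, b * m)
    # (b, m, p) -> (b*m, p); the shared index now runs over b*m.
    w = b2.reshape(b * m, p)

    # One flat GEMM equals the sum of per-batch matmuls.
    assert torch.allclose(x @ w, (b1 @ b2).sum(dim=0), atol=1e-6)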
