intel
diff --git a/‎ideep/ideep/operators/inner_product.hpp
Lines changed: 18 additions & 9 deletions b/‎ideep/ideep/operators/inner_product.hpp
Lines changed: 18 additions & 9 deletions
diff --git a/‎ideep/ideep/tensor.hpp
Lines changed: 8 additions & 0 deletions b/‎ideep/ideep/tensor.hpp
Lines changed: 8 additions & 0 deletions
diff --git a/‎intel_pytorch_extension_py/utils.py
Lines changed: 5 additions & 1 deletion b/‎intel_pytorch_extension_py/utils.py
Lines changed: 5 additions & 1 deletion
diff --git a/‎torch_ipex/csrc/cpu/Conv.cpp
Lines changed: 15 additions & 73 deletions b/‎torch_ipex/csrc/cpu/Conv.cpp
Lines changed: 15 additions & 73 deletions
diff --git a/‎torch_ipex/csrc/cpu/Conv.h
Lines changed: 0 additions & 3 deletions b/‎torch_ipex/csrc/cpu/Conv.h
Lines changed: 0 additions & 3 deletions
@@ -178,8 +178,8 @@ struct inner_product_forward : public dnnl::inner_product_forward {
       // align weights data type with src
       dst_data_type = src.get_data_type() == data_type::bf16 ? data_type::bf16
                                                              : data_type::f32;
-      src_desc = src.get_desc().to_type(dst_data_type);
-      weights_desc = weights.get_desc().to_type(dst_data_type);
+      src_desc = src.get_desc().to_type(dst_data_type).to_format_any();
+      weights_desc = weights.get_desc().to_type(dst_data_type).to_format_any();
       if (with_bias) {
         IDEEP_ENFORCE(utils::one_of(bias.get_data_type(),
                                     data_type::f32, data_type::bf16),
@@ -197,9 +197,16 @@ struct inner_product_forward : public dnnl::inner_product_forward {
 
     auto expected_src = src.reorder_if_differ_in(pd.src_desc(), src_attr);
     auto expected_weights = weights.reorder_if_differ_in(pd.weights_desc(), weights_attr);
-    dst.reinit_if_possible(pd.dst_desc());
-    if (!dst_scales.empty() && dst.get_data_type() != data_type::f32) {
-      dst.set_scale(dst_scales_in);
+    // [ Note output buffer ]
+    // If dst is an empty ideep tensor, it can be (re-)initialized here from pd.dst_desc().
+    // If dst is not empty, ideep must write the result into dst's memory, and it is the caller's
+    // duty to make sure dst is big enough to hold the result.
+    if (dst.is_empty()) {
+      dst.init(pd.dst_desc());
+    }
+    auto expected_dst = dst.reorder_if_differ_in(pd.dst_desc());
+    if (!dst_scales.empty() && utils::one_of(dst.get_data_type(), data_type::u8, data_type::s8)) {  
+      expected_dst.set_scale(dst_scales_in);
     }
 
     if (with_bias){
@@ -208,17 +215,19 @@ struct inner_product_forward : public dnnl::inner_product_forward {
                         {{DNNL_ARG_SRC, expected_src},
                          {DNNL_ARG_WEIGHTS, expected_weights},
                          {DNNL_ARG_BIAS, expected_bias},
-                         {DNNL_ARG_DST, dst}});
+                         {DNNL_ARG_DST, expected_dst}});
     } else {
       super(pd).execute(stream::default_stream(),
                         {{DNNL_ARG_SRC, expected_src},
                          {DNNL_ARG_WEIGHTS, expected_weights},
-                         {DNNL_ARG_DST, dst}});
+                         {DNNL_ARG_DST, expected_dst}});
     }
 
-    if (attr.non_negitive_output() && dst.get_data_type() == data_type::s8) {
-      dst.to_type(data_type::u8);
+    if (attr.non_negitive_output() && expected_dst.get_data_type() == data_type::s8) {
+      expected_dst.to_type(data_type::u8);
     }
+    // reorder back to dst's buffer if needed
+    expected_dst.reorder_to_if_differ_from(dst);
   }
 };
 
 
@@ -663,6 +663,14 @@ class tensor : public memory {
     }
   }
 
+  // Reorder data from *this into dst via reorder_to(), but only when dst's memory desc (size, stride, format, etc.) differs from that of *this; when the two descs compare equal, nothing is copied. NOTE(review): presumably callers rely on dst already sharing the buffer of *this in that case -- confirm at call sites.
+  void reorder_to_if_differ_from(tensor &dst, const attr_t &aattr = attr_t()) const {
+    if (dst.get_desc() != get_desc()) {
+      this->reorder_to(dst, aattr);
+    }
+    return;
+  }
+
   // workaround for issue intel/mkl-dnn#588
   desc _get_unblocked_desc_if_4c_blocked() const {
     auto desc = get_desc();
 
@@ -23,7 +23,11 @@ def convert_module_data_type(module, dtype):
     return module
 
 def optimize(model, dtype=torch.bfloat16, level='O1'):
-    optimized_model = conv_bn_fuse(model)
+    try:
+        optimized_model = conv_bn_fuse(model)
+    except:
+        warnings.warn("Conv BN folding failed during the optimize process.")
+        optimized_model = model
     if dtype == torch.bfloat16:
         optimized_model = convert_module_data_type(optimized_model, torch.bfloat16)
     return optimized_model
@@ -1,18 +1,11 @@
 #include "Conv.h"
 #include "mkldnn/MKLDNNCommon.h"
 #include "torch_ipex/csrc/utils.h"
+#include "WeightPrepack.h"
 
 namespace torch_ipex {
 namespace cpu {
 
-namespace {
-
-using weakref_type = c10::weak_intrusive_ptr<c10::TensorImpl, c10::UndefinedTensorImpl>;
-using val_blocked = std::tuple<weakref_type, ideep::tensor>;
-thread_local std::unordered_map<c10::TensorImpl *, val_blocked> cached_weights;
-
-}  // namespace
-
 std::vector<int64_t> calc_conv_output_size(
     at::IntArrayRef input_size,
     at::IntArrayRef kernel_size,
@@ -30,61 +23,6 @@ std::vector<int64_t> calc_conv_output_size(
   return output_size;
 }
 
-ideep::tensor get_prepack_conv_weights(
-    const ideep::tensor& input,
-    const at::Tensor& weight,
-    at::IntArrayRef stride,
-    at::IntArrayRef padding,
-    at::IntArrayRef dilation,
-    int64_t groups,
-    const ideep::attr_t& attr) {
-  auto it = cached_weights.find(weight.unsafeGetTensorImpl());
-  if (it != cached_weights.end()) {
-    return std::get<1>(it->second);
-  } else {
-    ideep::tensor w = at::native::itensor_view_from_dense(weight);
-    // TODO: 3d check
-    bool is_channels_last = input.get_desc().is_nhwc();
-    ideep::tensor::desc packed_desc;
-    if (is_channels_last) {
-      packed_desc = ideep::convolution_forward::expected_weights_desc<true>(
-        w.get_dims(),
-        w.get_data_type(),
-        stride.vec(),
-        padding.vec(),
-        padding.vec(),
-        dilation.vec(),
-        groups,
-        ideep::algorithm::convolution_direct,
-        ideep::prop_kind::forward,
-        input.get_data_type(),
-        input.get_dims(),
-        attr);
-    } else {
-      packed_desc = ideep::convolution_forward::expected_weights_desc<false>(
-        w.get_dims(),
-        w.get_data_type(),
-        stride.vec(),
-        padding.vec(),
-        padding.vec(),
-        dilation.vec(),
-        groups,
-        ideep::algorithm::convolution_direct,
-        ideep::prop_kind::forward,
-        input.get_data_type(),
-        input.get_dims(),
-        attr);
-    }
-    ideep::tensor result;
-    result.init(packed_desc);
-    result.feed_from(w);
-    cached_weights.emplace(
-        weight.unsafeGetTensorImpl(),
-        val_blocked{weakref_type(weight.getIntrusivePtr()), result});
-    return result;
-  }
-}
-
 at::Tensor convolution_impl(
     const at::Tensor& input,
     const at::Tensor& weight,
@@ -96,22 +34,24 @@ at::Tensor convolution_impl(
     const ideep::attr_t& attr) {
 // TODO: the input will be actively converted to channels last format
 // after the 5-D tensor supports channels last format.
-  const ideep::tensor mkldnn_input = at::native::itensor_view_from_dense(input);
-  ideep::tensor mkldnn_weight = get_prepack_conv_weights(mkldnn_input, weight, stride, padding, dilation, groups, attr);
+  auto input_ = IS_CONTIGUOUS_ANY(input) ? input : input.contiguous();
+  const ideep::tensor mkldnn_input = at::native::itensor_view_from_dense(input_);
+  ideep::tensor mkldnn_weight = get_conv_prepacked_weight(mkldnn_input, weight, stride, padding, dilation, groups, attr);
   auto kernel_size = mkldnn_weight.get_dims();
   std::vector<int64_t> input_size = mkldnn_input.get_dims();
   std::vector<int64_t> output_sizes =
       calc_conv_output_size(input_size, kernel_size, padding, stride, dilation);
 
-  bool is_channels_last = input.suggest_memory_format() == at::MemoryFormat::ChannelsLast;
-  auto output = at::empty(output_sizes, input.options().memory_format(input.suggest_memory_format()));
+  bool is_channels_last = input_.suggest_memory_format() == at::MemoryFormat::ChannelsLast;
+  auto output = at::empty(output_sizes, input_.options().memory_format(input_.suggest_memory_format()));
   ideep::tensor mkldnn_output;
   if (is_channels_last) {
     mkldnn_output = at::native::itensor_view_from_dense(output);
   }
 
   if (bias.defined()) {
-    const ideep::tensor mkldnn_bias = at::native::itensor_view_from_dense(bias);
+    auto bias_ = IS_CONTIGUOUS_ANY(bias) ? bias : bias.contiguous();
+    const ideep::tensor mkldnn_bias = at::native::itensor_view_from_dense(bias_);
     ideep::convolution_forward::compute(
         mkldnn_input,
         mkldnn_weight,
@@ -165,20 +105,22 @@ void convolution_inplace_impl(
     const ideep::attr_t& attr) {
 // TODO: the input will be actively converted to channels last format
 // after the 5-D tensor supports channels last format.
-  const ideep::tensor mkldnn_input = at::native::itensor_view_from_dense(input);
-  ideep::tensor mkldnn_weight = get_prepack_conv_weights(mkldnn_input, weight, stride, padding, dilation, groups, attr);
+  auto input_ = IS_CONTIGUOUS_ANY(input) ? input : input.contiguous();
+  const ideep::tensor mkldnn_input = at::native::itensor_view_from_dense(input_);
+  ideep::tensor mkldnn_weight = get_conv_prepacked_weight(mkldnn_input, weight, stride, padding, dilation, groups, attr);
   auto kernel_size = mkldnn_weight.get_dims();
   std::vector<int64_t> input_size = mkldnn_input.get_dims();
   std::vector<int64_t> output_sizes =
       calc_conv_output_size(input_size, kernel_size, padding, stride, dilation);
 
-  bool is_channels_last = input.suggest_memory_format() == at::MemoryFormat::ChannelsLast;
+  bool is_channels_last = input_.suggest_memory_format() == at::MemoryFormat::ChannelsLast;
   output = IS_CONTIGUOUS_ANY(output) ? output : output.contiguous();
-  output = output.to(input.suggest_memory_format());
+  output = output.to(input_.suggest_memory_format());
   ideep::tensor mkldnn_output = at::native::itensor_view_from_dense(output);
 
   if (bias.defined()) {
-    const ideep::tensor mkldnn_bias = at::native::itensor_view_from_dense(bias);
+    auto bias_ = IS_CONTIGUOUS_ANY(bias) ? bias : bias.contiguous();
+    const ideep::tensor mkldnn_bias = at::native::itensor_view_from_dense(bias_);
     ideep::convolution_forward::compute(
         mkldnn_input,
         mkldnn_weight,
 
@@ -1,11 +1,8 @@
 #pragma once
 
 #include <ATen/Tensor.h>
-
 #include "ideep/ideep.hpp"
 
-#include <vector>
-
 namespace torch_ipex {
 namespace cpu {
Original file line number	Diff line number	Diff line change
`@@ -663,6 +663,14 @@ class tensor : public memory {`
`663`	`663`	`}`
`664`	`664`	`}`
`665`	`665`
	`666`	`+ // Reorder data from *this to dst if dst's memory desc(size, stride, format, etc) is different from *this;`
	`667`	`+ void reorder_to_if_differ_from(tensor &dst, const attr_t &aattr = attr_t()) const {`
	`668`	`+ if (dst.get_desc() != get_desc()) {`
	`669`	`+ this->reorder_to(dst, aattr);`
	`670`	`+ }`
	`671`	`+ return;`
	`672`	`+ }`
	`673`	`+`
`666`	`674`	`// workaround for issue intel/mkl-dnn#588`
`667`	`675`	`desc _get_unblocked_desc_if_4c_blocked() const {`
`668`	`676`	`auto desc = get_desc();`