intel
diff --git a/‎torch_ipex/csrc/cpu/CustomOPs.h
Lines changed: 4 additions & 2 deletions b/‎torch_ipex/csrc/cpu/CustomOPs.h
Lines changed: 4 additions & 2 deletions
diff --git a/‎torch_ipex/csrc/cpu/DevOPs.cpp
Lines changed: 48 additions & 3 deletions b/‎torch_ipex/csrc/cpu/DevOPs.cpp
Lines changed: 48 additions & 3 deletions
diff --git a/‎torch_ipex/csrc/cpu/DevOPs.h
Lines changed: 1 addition & 1 deletion b/‎torch_ipex/csrc/cpu/DevOPs.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎torch_ipex/csrc/cpu/ExtendOPs.cpp
Lines changed: 30 additions & 4 deletions b/‎torch_ipex/csrc/cpu/ExtendOPs.cpp
Lines changed: 30 additions & 4 deletions
diff --git a/‎torch_ipex/csrc/cpu/ShadeDataContext.h
Lines changed: 10 additions & 0 deletions b/‎torch_ipex/csrc/cpu/ShadeDataContext.h
Lines changed: 10 additions & 0 deletions
@@ -714,15 +714,16 @@ class NewRNNLayerOp : public torch::autograd::Function<NewRNNLayerOp> {
 public:
   static std::vector<at::Tensor> _forward(const at::Tensor& input, const at::Tensor& w1, const at::Tensor& w2,
     const at::Tensor& w3, const at::Tensor& w4, const at::Tensor& hx, const at::Tensor& cx, bool reverse, int64_t mode,
-    int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes) {
+    int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes, 
+    const std::vector<float>& scales = {}, const std::vector<int32_t>& shift = {}, bool quantized = false) {
 #if defined(IPEX_PROFILE_OP)
     RECORD_FUNCTION("NewRNNLayerOp::_forward", std::vector<c10::IValue>({}));
 #endif
     try {
       if (torch_ipex::check_auto_dnnl() &&
           input.device().type() == c10::DeviceType::XPU) {
         return torch_ipex::cpu::AtenIpexCPUDev::dil_rnn_layer(
-            input, w1, w2, w3, w4, hx, cx, reverse, mode, hidden_size, num_layers, has_biases, train, bidirectional, batch_sizes);
+            input, w1, w2, w3, w4, hx, cx, reverse, mode, hidden_size, num_layers, has_biases, train, bidirectional, batch_sizes, scales, shift, quantized);
       }
     } catch (std::exception &e) {
 #if defined(_DEBUG)
@@ -783,6 +784,7 @@ class NewRNNLayerOp : public torch::autograd::Function<NewRNNLayerOp> {
           grad_inputs[3], grad_inputs[4], grad_inputs[5],
           grad_inputs[6], at::Tensor(), at::Tensor(),
           at::Tensor(), at::Tensor(), at::Tensor(),
+          at::Tensor(), at::Tensor(), at::Tensor(),
           at::Tensor(), at::Tensor(), at::Tensor()};
       }
     } catch (std::exception &e) {
 
@@ -2075,17 +2075,60 @@ at::Tensor AtenIpexCPUDev::dil_cat(at::TensorList tensors, int64_t dim) {
   dim = at::legacy_cat_wrap_dim(dim, tensors);
   std::vector<dil::tensor> x;
   at::Tensor tensors_contiguous[tensors.size()];
+
+  bool has_scale = false;
+  bool has_shift = false;
+  dil::scale_t data_scale;
+  std::vector<int32_t> data_shift;
+
   for (auto i = 0; i < tensors.size(); i++) {
     IPEX_CHECK(!(tensors[i].dim() == 1 && tensors[i].sizes()[0] == 0),
       "Currently Mkldnn cat operators do not support empty tensor.");
     tensors_contiguous[i] = tensors[i].is_contiguous() ? tensors[i] : tensors[i].contiguous();
 
     dbl::comm::reorder_to_bf16_for_mix_prec(tensors_contiguous[i], true);
 
-    x.push_back(dbl::comm::try_gen_dil_tensor(tensors_contiguous[i]));
+    auto dil_input = dbl::comm::try_gen_dil_tensor(tensors_contiguous[i]);
+
+    // TODO: verify using a simpler way??
+    if (i == 0) {
+      if (dil_input.has_scale()) {
+        IPEX_CHECK(dil_input.get_scale().size() == 1, "only support scale size == 1");
+        has_scale = true;
+        data_scale = dil_input.get_scale();
+      }
+      if (dil_input.has_zero_point()) {
+        IPEX_CHECK(dil_input.get_zero_point().size() == 1, "only support zero point size == 1");
+        has_shift = true;
+        data_shift = dil_input.get_zero_point();
+      }
+    } else {
+      IPEX_CHECK(dil_input.has_scale() == has_scale, "tensors to cat should have same scale");
+      if (dil_input.has_scale()) {
+        IPEX_CHECK(dil_input.get_scale().size() == 1, "only support scale size == 1");
+        IPEX_CHECK(dil_input.get_scale()[0] == data_scale[0], "tensors to cat should have same scale");
+      }
+      IPEX_CHECK(dil_input.has_zero_point() == has_shift, "tensors to cat should have same zero point");
+      if (dil_input.has_zero_point()) {
+        IPEX_CHECK(dil_input.get_zero_point().size() == 1, "only support zero point size == 1");
+        IPEX_CHECK(dil_input.get_zero_point()[0] == data_shift[0], "tensors to cat should have same zero point");
+      }
+    }
+
+    x.push_back(dil_input);
   }
   dil::tensor y;
   dil::concat::compute(x, dim, y);
+
+  // For bidirectional LSTM output cat
+  if (has_scale){
+    y.set_scale(data_scale);
+  }
+
+  if (has_shift) {
+    y.set_zero_point(data_shift);
+  }
+
   return dbl::comm::gen_aten_tensor_by(std::move(y));
 }
 
@@ -2597,10 +2640,12 @@ at::Tensor& AtenIpexCPUDev::dil_copy_(
 
 std::vector<at::Tensor> AtenIpexCPUDev::dil_rnn_layer(const at::Tensor& input, const at::Tensor& w1, const at::Tensor& w2,
     const at::Tensor& w3, const at::Tensor& w4, const at::Tensor& hx, const at::Tensor& cx, bool reverse, int64_t mode,
-    int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes) {
+    int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes, 
+    const std::vector<float>& scales, const std::vector<int32_t>& shift, bool quantized) {
   DEBUG("AtenIpexCPUDev::dil_rnn_layer\n");
+
   return dbl::rnn::mkldnn_rnn_layer(input, w1, w2, w3, w4, hx, cx, reverse, mode,
-      hidden_size, num_layers, has_biases, train, bidirectional, batch_sizes);
+      hidden_size, num_layers, has_biases, train, bidirectional, batch_sizes, scales, shift, quantized);
 }
 
 std::vector<at::Tensor> AtenIpexCPUDev::dil_rnn_layer_backward(const at::Tensor& input, const at::Tensor& w1, const at::Tensor& w2,
 
@@ -94,7 +94,7 @@ class AtenIpexCPUDev {
   static at::Tensor dil_shuffle(const at::Tensor & self, at::IntArrayRef view_shape, int64_t dim0, int64_t dim1);
   static std::tuple<at::Tensor,at::Tensor> dil__pack_padded_sequence(const at::Tensor & input, const at::Tensor & lengths, bool batch_first);
   static at::Tensor& dil_copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking);
-  static std::vector<at::Tensor> dil_rnn_layer(const at::Tensor& input, const at::Tensor& w1, const at::Tensor& w2, const at::Tensor& w3, const at::Tensor& w4, const at::Tensor& hx, const at::Tensor& cx, bool reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes);
+  static std::vector<at::Tensor> dil_rnn_layer(const at::Tensor& input, const at::Tensor& w1, const at::Tensor& w2, const at::Tensor& w3, const at::Tensor& w4, const at::Tensor& hx, const at::Tensor& cx, bool reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const std::vector<float>& scales, const std::vector<int32_t>& shift, bool quantized);
   static std::vector<at::Tensor> dil_rnn_layer_backward(const at::Tensor& input, const at::Tensor& w1, const at::Tensor& w2, const at::Tensor& w3, const at::Tensor& w4, const at::Tensor& hx, const at::Tensor& cx, const at::Tensor& output, const at::Tensor& hy, const at::Tensor& cy, const at::Tensor& grad_output, const at::Tensor& grad_hy, const at::Tensor& grad_cy, bool reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes);
   static at::Tensor dil_upsample_nearest1d(const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales);
   static at::Tensor dil_upsample_nearest1d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales);
 
@@ -3,9 +3,11 @@
 #include "CustomOPs.h"
 #include "DevOPs.h"
 #include "FusionOPs.h"
+#include "dbl/Common.h"
 #include "aten/aten.hpp"
 #include "bf16/vec/bf16_vec_kernel.h"
 #include "dil/dil.hpp"
+#include "torch_ipex/csrc/cpu/int8/Config.h"
 #include "xsmm/libxsmm_utils.h"
 #include <ATen/Parallel.h>
 #include <ATen/MatrixRef.h>
@@ -465,16 +467,19 @@ std::vector<at::Tensor> rnn_layer(const at::Tensor& input,
     at::TensorList weights, const at::Tensor& hx,
     const at::Tensor& cx, bool reverse, int64_t mode,
     int64_t hidden_size, int64_t num_layers, bool train,
-    bool bidirectional, at::IntArrayRef batch_sizes) {
+    bool bidirectional, at::IntArrayRef batch_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int32_t>& shift, 
+    bool quantized) {
   TORCH_CHECK(weights.size() == 2 || weights.size() == 4);
   if (weights.size() == 4) {
     if (at::GradMode::is_enabled())
       return NewRNNLayerOp::apply(input, weights[0], weights[1], weights[2], weights[3], hx, cx, reverse, mode, hidden_size, num_layers, true, train, bidirectional, batch_sizes);
-    return NewRNNLayerOp::_forward(input, weights[0], weights[1], weights[2], weights[3], hx, cx, reverse, mode, hidden_size, num_layers, true, train, bidirectional, batch_sizes);
+    return NewRNNLayerOp::_forward(input, weights[0], weights[1], weights[2], weights[3], hx, cx, reverse, mode, hidden_size, num_layers, true, train, bidirectional, batch_sizes, scales, shift, quantized);
   } else {
     if (at::GradMode::is_enabled())
       return NewRNNLayerOp::apply(input, weights[0], weights[1], at::zeros(weights[0].sizes(), weights[0].options()), at::zeros(weights[1].sizes(), weights[1].options()), hx, cx, reverse, mode, hidden_size, num_layers, false, train, bidirectional, batch_sizes);
-    return NewRNNLayerOp::_forward(input, weights[0], weights[1], at::zeros(weights[0].sizes(), weights[0].options()), at::zeros(weights[1].sizes(), weights[1].options()), hx, cx, reverse, mode, hidden_size, num_layers, false, train, bidirectional, batch_sizes);
+    return NewRNNLayerOp::_forward(input, weights[0], weights[1], at::zeros(weights[0].sizes(), weights[0].options()), at::zeros(weights[1].sizes(), weights[1].options()), hx, cx, reverse, mode, hidden_size, num_layers, false, train, bidirectional, batch_sizes, scales, shift, quantized);
   }
 }
 // MKLDNN RNN integration notes:
@@ -514,6 +519,27 @@ std::vector<at::Tensor> rnn(
   at::MatrixRef<at::Tensor> weights{weight, static_cast<size_t>(weight_stride0)};
 
   auto num_directions = bidirectional ? 2 : 1;
+
+  // no need to do calibration for the output in lstm, will use the scale & zero point of the input
+  // to dequantize the output from u8 to f32, need to add an "output" here but actually unused
+  // For LSTM, we only need to calibrate the input to the first layer
+  // TODO: add int8 for gru and rnn. 
+  if (check_auto_mix_int8_fp32() && check_int8_calibration() && static_cast<dil::rnn_kind>(mode) == dil::rnn_kind::LSTM) {
+    int64_t num_ops_id = Int8OptConfig::fetch_and_add_ops_id();
+    insert_or_updata_observer({input}, {input}, "lstm", num_ops_id, /*asymmetric*/true);
+  }
+
+  // Quantization parameters that are forwarded to every rnn_layer call below.
+  // They are only filled in when int8 inference (mix int8/fp32 enabled and not
+  // calibrating) is active for an LSTM; otherwise quantized stays false.
+  bool quantized = false;
+  // Seed both containers with a single empty entry: the per-layer loop below
+  // reads scales[0] / shift[0] unconditionally, and indexing an empty outer
+  // vector would be undefined behaviour on the non-int8 path.
+  std::vector<std::vector<float>> scales(1);
+  std::vector<std::vector<int32_t>> shift(1);
+  if (check_auto_mix_int8_fp32() && !check_int8_calibration() && static_cast<dil::rnn_kind>(mode) == dil::rnn_kind::LSTM) {
+      int64_t num_ops_id = Int8OptConfig::fetch_and_add_ops_id();
+      quantized = torch_ipex::cpu::dbl::comm::get_int8_quantized_status(num_ops_id);
+      // Replaces the placeholder entries with the values recorded for this op id.
+      std::tie(scales, shift) = torch_ipex::cpu::dbl::comm::get_int8_asymmetric(num_ops_id);
+      IPEX_CHECK(scales.size() > 0, "incorrect scale size");
+      IPEX_CHECK(shift.size() > 0, "incorrect shift size");
+  }
+
   auto layer_input = input;
   std::vector<at::Tensor> layer_output(num_directions);
   std::vector<at::Tensor> layer_hy(num_layers * num_directions);
@@ -525,7 +551,7 @@ std::vector<at::Tensor> rnn(
       auto layer_hx = hx[index];
       auto layer_cx = cx[index];
       auto reverse = (direction > 0);
-      auto outputs = rnn_layer(layer_input, layer_weights, layer_hx, layer_cx, reverse, mode, hidden_size, num_layers, train, bidirectional, batch_sizes);
+      auto outputs = rnn_layer(layer_input, layer_weights, layer_hx, layer_cx, reverse, mode, hidden_size, num_layers, train, bidirectional, batch_sizes, scales[0], shift[0], quantized);
       layer_output[direction] = outputs[0];
       layer_hy[index] = outputs[1];
       layer_cy[index] = outputs[2];
 
@@ -227,6 +227,16 @@ struct ShadeDataContext {
     return res;
   }
 
+  /**
+   * Check whether the shade data context attached to the input aten tensor is
+   * tagged with the given mixed-precision type.
+   *
+   * @param tensor    an aten tensor
+   * @param mix_dtype the mixed-precision type to test for
+   * @return true only if the tensor's storage carries a context whose
+   *         mix_prec_type equals mix_dtype and mix_dtype is not
+   *         MIX_PREC_TYPE::NONE; false otherwise, including when the storage
+   *         has no context attached at all.
+   */
+  static inline bool isTensorMixPrecision(const at::Tensor &tensor, MIX_PREC_TYPE mix_dtype) {
+    void *raw_context = tensor.storage().data_ptr().get_context();
+    // A storage without any attached context cannot be a mix-precision tensor;
+    // bail out instead of dereferencing a null pointer.
+    if (raw_context == nullptr) {
+      return false;
+    }
+    // NOTE(review): assumes a non-null context is always a ShadeDataContext - confirm against the allocator.
+    ShadeDataContext *shade_data_context = static_cast<ShadeDataContext*>(raw_context);
+    return mix_dtype != MIX_PREC_TYPE::NONE && shade_data_context->mix_prec_type == mix_dtype;
+  }
+
   /**
    * Check if the input aten tensor is a parameter.
    *