Commit d600af5

Merge remote-tracking branch 'gitlab/master'
2 parents: 5bc355e + 41caea8

26 files changed: +885 / -237 lines

scripts/cpu/gen-dense-cpu-ops.py

Lines changed: 4 additions & 2 deletions
@@ -95,7 +95,9 @@
   'aten::upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor',
   'aten::upsample_linear1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None) -> Tensor',
   'aten::upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor',
+  'aten::upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor',
   'aten::upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor',
+  'aten::upsample_bilinear2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor',
   'aten::upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor',
   'aten::upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor',
   'aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)',
@@ -578,7 +580,7 @@ def is_conv_overrideable_func(fname):
 
   # Gen OP Name
   code += '#if defined(IPEX_DISP_OP)\n'
-  code += ' printf("{}::{}\\n");\n'.format(_IPEX_OP_FUNC_NS, cpp_sig.def_name)
+  code += ' printf("{}::{}\\n");\n'.format(_IPEX_OP_FUNC_NS, new_cpp_func_name)
   code += '#endif\n'
 
   # Gen profile info
@@ -587,7 +589,7 @@ def is_conv_overrideable_func(fname):
     if param.core_type in ['Tensor', 'Scalar']:
       profiler_inputs.append(param.name)
   code += '#if defined(IPEX_PROFILE_OP)\n'
-  code += ' RECORD_FUNCTION("{ns}::{name}", std::vector<c10::IValue>({{{input_names}}}));\n'.format(ns=_IPEX_OP_FUNC_NS, name=cpp_sig.def_name, input_names=', '.join(profiler_inputs))
+  code += ' RECORD_FUNCTION("{ns}::{name}", std::vector<c10::IValue>({{}}));\n'.format(ns=_IPEX_OP_FUNC_NS, name=new_cpp_func_name)
   code += '#endif\n'
 
   if is_conv_overrideable_func(cpp_sig.def_name):
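
Note: both format strings above now key the dispatch banner and the profiler record on new_cpp_func_name rather than cpp_sig.def_name, presumably so that overloaded schemas such as aten::upsample_bilinear2d.vec get their own generated wrapper names, and the RECORD_FUNCTION call no longer forwards the captured inputs. A minimal sketch (not part of this commit) of what the two strings expand to, using hypothetical stand-in values:

    # Sketch only: shows how the updated format strings expand.
    # _IPEX_OP_FUNC_NS and new_cpp_func_name are hypothetical stand-ins here;
    # the real values are computed inside gen-dense-cpu-ops.py.
    _IPEX_OP_FUNC_NS = 'torch_ipex'
    new_cpp_func_name = 'upsample_bilinear2d_vec'  # assumed name for the ".vec" overload

    code = ''
    code += '#if defined(IPEX_DISP_OP)\n'
    code += ' printf("{}::{}\\n");\n'.format(_IPEX_OP_FUNC_NS, new_cpp_func_name)
    code += '#endif\n'
    code += '#if defined(IPEX_PROFILE_OP)\n'
    code += ' RECORD_FUNCTION("{ns}::{name}", std::vector<c10::IValue>({{}}));\n'.format(ns=_IPEX_OP_FUNC_NS, name=new_cpp_func_name)
    code += '#endif\n'
    print(code)
    # Prints (given the stand-ins above):
    # #if defined(IPEX_DISP_OP)
    #  printf("torch_ipex::upsample_bilinear2d_vec\n");
    # #endif
    # #if defined(IPEX_PROFILE_OP)
    #  RECORD_FUNCTION("torch_ipex::upsample_bilinear2d_vec", std::vector<c10::IValue>({}));
    # #endif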

tests/cpu/test_int8.py

Lines changed: 54 additions & 7 deletions
@@ -8,6 +8,7 @@
 import itertools
 import time
 import json
+import sys
 
 import torch
 import torch.nn as nn
@@ -22,6 +23,9 @@
 
 from common_utils import TestCase
 
+def get_rand_seed():
+    return int(time.time() * 1000000000)
+
 device = ipex.DEVICE
 class TestQuantizationConfigueTune(TestCase):
     def test_quantization_status(self):
@@ -71,7 +75,7 @@ def test_quantization_status(self):
 
 
 class TestQuantization(TestCase):
-    def compare_fp32_int8(self, model, x):
+    def _compare_fp32_int8(self, model, x):
         conf = ipex.AmpConf(torch.int8)
         with ipex.AutoMixPrecision(conf, running_mode='calibration'):
             ref = model(x)
@@ -85,6 +89,25 @@ def compare_fp32_int8(self, model, x):
         self.assertEqual(ref, y, prec=0.1)
         os.remove('configure.json')
 
+    def _lstm_compare_fp32_int8(self, model, *args):
+        conf = ipex.AmpConf(torch.int8)
+        with ipex.AutoMixPrecision(conf, running_mode='calibration'):
+            with torch.no_grad():
+                ref, hy_ref = model(*args)
+            conf.save('configure.json')
+
+        conf = ipex.AmpConf(torch.int8, 'configure.json')
+        with ipex.AutoMixPrecision(conf, running_mode='inference'):
+            with torch.no_grad():
+                y, hy = model(*args)
+
+        self.assertTrue(ipex.core.is_int8_dil_tensor(y))
+
+        self.assertEqual(ref, y, prec=0.1)
+        self.assertEqual(hy_ref[0], hy[0], prec=0.01)
+        self.assertEqual(hy_ref[1], hy[1], prec=0.01)
+        os.remove('configure.json')
+
     def test_conv2d(self):
         options = itertools.product([1, 4], [True, False], [1, 2])
         for groups, bias, dilation in options:
@@ -100,12 +123,12 @@ def test_conv2d(self):
                 dilation=dilation,
                 bias=bias,
                 groups=groups).float().to(device)
-            self.compare_fp32_int8(conv2d, x)
+            self._compare_fp32_int8(conv2d, x)
 
     def test_relu(self):
         x = torch.randn((4, 5), dtype=torch.float32).to(device)
         relu = nn.ReLU()
-        self.compare_fp32_int8(relu, x)
+        self._compare_fp32_int8(relu, x)
 
     def test_max_pool2d(self):
         N = torch.randint(3, 10, (1,)).item()
@@ -118,7 +141,7 @@ def test_max_pool2d(self):
                 stride=stride,
                 padding=1,
                 ceil_mode=ceil_mode)
-            self.compare_fp32_int8(max_pool2d, x)
+            self._compare_fp32_int8(max_pool2d, x)
 
     def test_avg_pool2d(self):
         N = torch.randint(3, 10, (1,)).item()
@@ -131,15 +154,15 @@ def test_avg_pool2d(self):
                 stride=2,
                 padding=1,
                 count_include_pad=count_include_pad)
-            self.compare_fp32_int8(avg_pool2d, x)
+            self._compare_fp32_int8(avg_pool2d, x)
 
     def test_adaptive_avg_pool2d(self):
         N = torch.randint(3, 10, (1,)).item()
         C = torch.randint(3, 10, (1,)).item()
         x = torch.randn(N, C, 224, 224, dtype=torch.float32).to(device)
 
         adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d(7)
-        self.compare_fp32_int8(adaptive_avg_pool2d, x)
+        self._compare_fp32_int8(adaptive_avg_pool2d, x)
 
     def test_linear(self):
         in_features = torch.randint(3, 10, (1,)).item()
@@ -148,8 +171,32 @@ def test_linear(self):
         for bias in [True, False]:
             x = torch.randn(3, in_features, dtype=torch.float32).to(device)
             linear = torch.nn.Linear(in_features, out_features, bias=bias).float().to(device)
-            self.compare_fp32_int8(linear, x)
+            self._compare_fp32_int8(linear, x)
+
+    def _lstm_int8(self, seq_len, batch_size, input_size, hidden_size, num_layers, bidirectional, bias, empty_state):
+        rand_seed = int(get_rand_seed())
+
+        print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))
+        torch.manual_seed(rand_seed)
+
+        num_directions = 2 if bidirectional else 1
+
+        input_dpcpp = torch.FloatTensor(seq_len, batch_size, input_size).uniform_(-1, 1).to(device=device)
+        h0_dpcpp = torch.FloatTensor(num_layers * num_directions, batch_size, hidden_size).uniform_(-1, 1).to(device=device)
+        c0_dpcpp = torch.FloatTensor(num_layers * num_directions, batch_size, hidden_size).uniform_(-1, 1).to(device=device)
+        model_dpcpp = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional, bias=bias).to(device=device).eval()
+
+        self._lstm_compare_fp32_int8(model_dpcpp, input_dpcpp)
 
+    def test_lstm(self):
+        self._lstm_int8(seq_len=5, batch_size=2, input_size=16, hidden_size=16, num_layers=1, bidirectional=False, bias=True, empty_state=False)
+
+        self._lstm_int8(seq_len=5, batch_size=2, input_size=16, hidden_size=16, num_layers=1, bidirectional=True, bias=True, empty_state=False)
+
+        self._lstm_int8(seq_len=5, batch_size=2, input_size=16, hidden_size=16, num_layers=1, bidirectional=False, bias=False, empty_state=False)
+
+        self._lstm_int8(seq_len=5, batch_size=2, input_size=16, hidden_size=16, num_layers=1, bidirectional=True, bias=False, empty_state=False)
+
 if __name__ == '__main__':
     rand_seed = int(time.time() * 1000000000)
     torch.manual_seed(rand_seed)
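
The new _lstm_compare_fp32_int8 helper follows the same two-phase int8 flow as the existing _compare_fp32_int8: one pass in 'calibration' mode to collect quantization parameters, conf.save() to write them to configure.json, then a reload of that file and a second pass in 'inference' mode, with the int8 output checked against the fp32 reference. A standalone sketch of that flow (not part of this commit; the LSTM shapes are arbitrary and only roughly mirror the test):

    import torch
    import intel_pytorch_extension as ipex

    model = torch.nn.LSTM(input_size=16, hidden_size=16, num_layers=1).to(ipex.DEVICE).eval()
    x = torch.randn(5, 2, 16).to(ipex.DEVICE)   # (seq_len, batch, input_size)

    conf = ipex.AmpConf(torch.int8)
    with ipex.AutoMixPrecision(conf, running_mode='calibration'):
        with torch.no_grad():
            ref, _ = model(x)                    # fp32 pass collects calibration stats
        conf.save('configure.json')              # persist the observed quantization parameters

    conf = ipex.AmpConf(torch.int8, 'configure.json')
    with ipex.AutoMixPrecision(conf, running_mode='inference'):
        with torch.no_grad():
            y, _ = model(x)                      # int8 path; the test expects it to match ref within 0.1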

tests/cpu/test_lazy_reorder.py

Lines changed: 54 additions & 14 deletions
@@ -13,6 +13,8 @@
 import itertools
 import torch
 import intel_pytorch_extension as ipex
+import contextlib
+import io
 
 from common_ipex_conf import AutoMixPrecision, AutoDNNL
 
@@ -1303,6 +1305,33 @@ def test_unsqueeze(self):
         x_dpcpp = x.clone().to(device=device)
         self.assertEqual(x_dpcpp.unsqueeze(1), x.unsqueeze(1))
 
+        with AutoDNNL(True):
+            x = torch.randn(3, 64, 64, dtype=torch.float32)
+            x_xpu = x.clone().to(device=device)
+            conv2d_cpu = torch.nn.Conv2d(3, 6, (3, 3))
+            conv2d_xpu = copy.deepcopy(conv2d_cpu).to(device=device)
+            x_nchw = x.unsqueeze(0)
+            x_xpu_nchw = x_xpu.unsqueeze(0)
+            self.assertEqual(conv2d_cpu(x_nchw), conv2d_xpu(x_xpu_nchw))
+
+            conv2d_cpu = torch.nn.Conv2d(3, 1, (3, 3))
+            conv2d_xpu = copy.deepcopy(conv2d_cpu).to(ipex.DEVICE)
+            # reshape the conv2d weight to chw
+            conv2d_weight_seq = conv2d_xpu.weight.clone().squeeze()
+            # reshape the conv2d weight to nchw
+            conv2d_weight_unseq = torch.unsqueeze(conv2d_weight_seq, 0)
+
+            conv2d_xpu.weight.data = conv2d_weight_unseq
+
+            a = torch.randn(1, 3, 10, 10).to(ipex.DEVICE)
+            # Make sure the conv2d_xpu.weight is blocked format
+            conv2d_xpu(a)
+            # Make sure the unsqueeze does not trigger reorder
+            conv2d_weight_unseq = torch.unsqueeze(conv2d_weight_seq, 0)
+            self.assertEqual(conv2d_xpu(a), conv2d_cpu(a.to("cpu")))
+
+
+
 class TestSoftMax(TestCase):
     def test_softmax(self):
         with AutoDNNL(True):
@@ -1580,7 +1609,7 @@ def _lstm_params_list(self, cell):
         if cell == "RNN":
             params_dict["nonlinearity"] = ["tanh"] # ["tanh", "relu"] TODO relu has accuracy issue
         elif cell == "GRU":
-            params_dict["nonlinearity"] = [""] 
+            params_dict["nonlinearity"] = [""]
 
         params_list = []
 
@@ -1592,16 +1621,16 @@ def _test_lstm(self, training):
         rand_seed = int(get_rand_seed())
         print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))
         torch.manual_seed(rand_seed)
-        
+
         params_list = self._lstm_params_list("LSTM")
 
         for input_size, hidden_size, num_layers, bidirectional, bias, empty_state, batch_first, dropout, batch_size, seq_len in itertools.product(*params_list):
             # dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1
             if dropout > 0 and num_layers == 1:
                 continue
-            
+
             num_directions = 2 if bidirectional else 1
-            
+
             if batch_first:
                 input = torch.randn(batch_size, seq_len, input_size)
             else:
@@ -1649,7 +1678,7 @@
                 hy_cpu[0].sum().backward(retain_graph=True)
                 hy_dpcpp[0].sum().backward(retain_graph=True)
                 self.assertEqual(h0_dpcpp.grad.to('cpu'), h_cpu.grad)
-                
+
                 hy_cpu[1].sum().backward(retain_graph=True)
                 hy_dpcpp[1].sum().backward(retain_graph=True)
                 self.assertEqual(c0_dpcpp.grad.to('cpu'), c_cpu.grad)
@@ -1658,16 +1687,16 @@ def _test_rnn(self, cell, training):
         rand_seed = int(get_rand_seed())
         print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))
         torch.manual_seed(rand_seed)
-        
+
         params_list = self._lstm_params_list(cell)
 
         for input_size, hidden_size, num_layers, bidirectional, bias, empty_state, batch_first, dropout, batch_size, seq_len, nonlinearity in itertools.product(*params_list):
             # dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1
             if dropout > 0 and num_layers == 1:
                 continue
-            
+
             num_directions = 2 if bidirectional else 1
-            
+
             if batch_first:
                 input = torch.randn(batch_size, seq_len, input_size)
             else:
@@ -1683,7 +1712,7 @@
                 model_cpu = torch.nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional, bias=bias, dropout=dropout, batch_first=batch_first, nonlinearity=nonlinearity)
             elif cell == "GRU":
                 model_cpu = torch.nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional, bias=bias, dropout=dropout, batch_first=batch_first)
-            
+
             model_cpu.train() if training else model_cpu.eval()
 
             input_dpcpp = input.clone().to(device=device).requires_grad_(training)
@@ -1720,7 +1749,7 @@ def _test_pack_padded_sequence_lstm(self, training):
         rand_seed = int(get_rand_seed())
         print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))
         torch.manual_seed(rand_seed)
-        
+
         embedding_dim = 1024
         hidden_dim = 10
         batch_size = 24
@@ -1755,7 +1784,7 @@ def _test_pack_padded_sequence_lstm(self, training):
 
         lstm_out, hidden_out = lstm(embeds, (hidden_0, hidden_1))
         lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
-        
+
         with AutoDNNL(True):
             lstm_out_dpcpp, hidden_out_dpcpp = lstm_dpcpp(embeds_dpcpp, (hidden_0_dpcpp, hidden_1_dpcpp))
             lstm_out_dpcpp, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out_dpcpp, batch_first=True)
@@ -1770,16 +1799,16 @@ def _test_pack_padded_sequence_lstm(self, training):
             self.assertEqual(sentences_dpcpp.grad.to('cpu'), sentences.grad)
             self.assertEqual(lstm_dpcpp.weight_ih_l0.grad.to('cpu'), lstm.weight_ih_l0.grad)
             self.assertEqual(lstm_dpcpp.weight_hh_l0.grad.to('cpu'), lstm.weight_hh_l0.grad)
-            
+
             self.assertEqual(lstm_dpcpp.bias_ih_l0.grad.to('cpu'), lstm.bias_ih_l0.grad)
             self.assertEqual(lstm_dpcpp.bias_hh_l0.grad.to('cpu'), lstm.bias_hh_l0.grad)
-            
+
             self.assertEqual(hidden_0_dpcpp.grad.to('cpu'), hidden_0.grad)
             self.assertEqual(hidden_1_dpcpp.grad.to('cpu'), hidden_1.grad)
 
     def test_lstm_inference(self):
         self._test_lstm(training=False)
-    
+
     def test_lstm_training(self):
         self._test_lstm(training=True)
 
@@ -1937,6 +1966,17 @@ def test_upsample_bilinear2d_scale_factor(self):
             y_dpcpp.sum().backward()
             self.assertEqual(x_cpu.grad, x_dpcpp.grad)
 
+        with AutoDNNL(True):
+            x = torch.randn(2, 2, 4, 4)
+            x_cpu = x.clone().requires_grad_()
+            x_dpcpp = x.clone().to(device=device).requires_grad_()
+            y_cpu = F.interpolate(x_cpu, scale_factor = [2, 3], mode='bilinear', align_corners=False, recompute_scale_factor=False)
+            y_dpcpp = F.interpolate(x_dpcpp, scale_factor = [2, 3], mode='bilinear', align_corners=False, recompute_scale_factor=False)
+            self.assertEqual(y_cpu, y_dpcpp)
+            y_cpu.sum().backward()
+            y_dpcpp.sum().backward()
+            self.assertEqual(x_cpu.grad, x_dpcpp.grad)
+
     def test_upsample_bilinear2d_size(self):
         rand_seed = int(get_rand_seed())
         print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))
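
The branch added above is the test-side counterpart of the aten::upsample_bilinear2d.vec / _backward.vec schemas registered in gen-dense-cpu-ops.py: calling F.interpolate with a list scale_factor and recompute_scale_factor=False keeps the per-dimension scale factors, which appears to be the case the new vector overloads cover. A minimal standalone repro (sketch only, outside the test harness; it omits the AutoDNNL toggle the test uses):

    import torch
    import torch.nn.functional as F
    import intel_pytorch_extension as ipex

    x = torch.randn(2, 2, 4, 4)
    x_cpu = x.clone().requires_grad_()
    x_xpu = x.clone().to(ipex.DEVICE).requires_grad_()

    # bilinear upsample with different scale factors per spatial dimension
    y_cpu = F.interpolate(x_cpu, scale_factor=[2, 3], mode='bilinear',
                          align_corners=False, recompute_scale_factor=False)
    y_xpu = F.interpolate(x_xpu, scale_factor=[2, 3], mode='bilinear',
                          align_corners=False, recompute_scale_factor=False)

    # forward results and input gradients should agree between CPU and the extension device
    assert torch.allclose(y_cpu, y_xpu.to('cpu'))
    y_cpu.sum().backward()
    y_xpu.sum().backward()
    assert torch.allclose(x_cpu.grad, x_xpu.grad.to('cpu'))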
