
Commit 6d0ff45

Merge remote-tracking branch 'gitlab/master'
2 parents 593f4b9 + b154e6d

File tree

9 files changed, +206 -86 lines changed

intel_pytorch_extension_py/ops/lstm.py

Lines changed: 12 additions & 11 deletions
@@ -3,11 +3,12 @@
 
 VF_lstm = _VF.lstm
 
-def ipex_lstm(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first):
-    if input.device.type == 'xpu' and (dropout == 0 or training == False):
-        return torch.ops.torch_ipex.lstm(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first)
+def ipex_lstm(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first, device):
+    # For LSTM training with dropout, fallback to cpu due to performance issue in oneDNN mode
+    if training and dropout != 0:
+        return fallback_lstm(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first, device=device)
     else:
-        return VF_lstm(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first)
+        return torch.ops.torch_ipex.lstm(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first)
 
 # users may only transfer the data but not the module to IPEX device, need to check if every item in the args is on "cpu" device
 def get_device(*args):

@@ -45,14 +46,14 @@ def fallback_lstm(*args, device):
     return tuple(output_device)
 
 def lstm(*args):
+    device = get_device(*args)
+    if device == "cpu":
+        return VF_lstm(*args)
+
+    # For LSTM with pack_padded_sequence as input, fallback to cpu due to performance issue in oneDNN mode
     if isinstance(args[1], torch.Tensor):
-        # For LSTM with pack_padded_sequence as input, fallback to cpu due to performance issue in oneDNN mode
-        device = get_device(*args)
-        if device == "cpu":
-            return VF_lstm(*args)
-        else:
-            return fallback_lstm(*args, device=device)
+        return fallback_lstm(*args, device=device)
     else:
-        return ipex_lstm(*args)
+        return ipex_lstm(*args, device=device)
 
 _VF.lstm = lstm
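Taken together, the dispatch after this commit is: resolve the device once up front, short-circuit pure-CPU calls to the stock VF_lstm, send packed-sequence inputs to fallback_lstm, and route everything else through ipex_lstm (which itself now falls back to CPU for training with dropout). A minimal sketch of exercising the patched entry point; the intel_pytorch_extension import name and ipex.DEVICE string are assumptions, not taken from this diff:

import torch
import intel_pytorch_extension as ipex  # assumed package name; importing it leaves _VF.lstm patched

lstm_mod = torch.nn.LSTM(input_size=8, hidden_size=16, num_layers=2, dropout=0.5).train()
x = torch.randn(4, 3, 8)  # (seq_len, batch, input_size)

# CPU tensors: get_device(*args) returns "cpu", so the new lstm() short-circuits to VF_lstm.
out, (h, c) = lstm_mod(x)

# Data moved to the IPEX device (module weights left on CPU) reaches ipex_lstm();
# since this module trains with dropout != 0, that path now falls back to CPU as well.
out_xpu, _ = lstm_mod(x.to(ipex.DEVICE))  # ipex.DEVICE is an assumed device string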

scripts/cpu/gen-sparse-cpu-ops.py

Lines changed: 1 addition & 1 deletion
@@ -406,7 +406,7 @@ def gen_code(self):
             if param.core_type in ['Tensor', 'Scalar']:
                 profiler_inputs.append(param.name)
         code += '#if defined(IPEX_PROFILE_OP)\n'
-        code += '  RECORD_FUNCTION("{ns}::{name}", std::vector<c10::IValue>({{{input_names}}}));\n'.format(ns=_IPEX_OP_FUNC_NS, name=cpp_sparse_sig.def_name, input_names=', '.join(profiler_inputs))
+        code += '  RECORD_FUNCTION("{ns}::{name}", std::vector<c10::IValue>({{{input_names}}}));\n'.format(ns=_IPEX_OP_FUNC_NS, name=cpp_sparse_sig.def_name, input_names='')
         code += '#endif\n'

         code += self.gen_fallback_prepare_code(cpp_sparse_sig)
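The behavioral change is easiest to see by rendering the format string by hand: with input_names='' the generated RECORD_FUNCTION call now always receives an empty std::vector<c10::IValue>, so the profiler records the op name but no longer captures its arguments. A quick illustration (the namespace torch_ipex, the op name add, and the input names are stand-in values, not taken from this diff):

# Rendering the same template the generator uses, with stand-in values.
template = '  RECORD_FUNCTION("{ns}::{name}", std::vector<c10::IValue>({{{input_names}}}));\n'

print(template.format(ns='torch_ipex', name='add', input_names='self, other'))
# before:   RECORD_FUNCTION("torch_ipex::add", std::vector<c10::IValue>({self, other}));

print(template.format(ns='torch_ipex', name='add', input_names=''))
# after:    RECORD_FUNCTION("torch_ipex::add", std::vector<c10::IValue>({}));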

tests/cpu/test_bf16_lazy_reorder.py

Lines changed: 84 additions & 0 deletions
@@ -486,6 +486,90 @@ def test_batch_norm3d_backward(self):
         self.assertTrue(ipex.core.is_bf16_dil_tensor(x_auto_mix_bf16.grad))
         self.assertEqual(x_man_bf16.grad.float(), x_auto_mix_bf16.grad)
 
+class TestLayerNorm(TestCase):
+    def test_layer_norm(self):
+        rand_seed = int(get_rand_seed())
+        print("{} rand seed: {}".format(sys._getframe().f_code.co_name, rand_seed))
+
+        x_cpu, x_auto_mix_inference, x_auto_mix_train, x_man_bf16, x_auto_mix_train_bf16 = _gen_tensor(
+            rand_seed, (2, 5, 10, 10))
+
+        op_cpu, op_auto_mix_inference, op_auto_mix_train, op_man_bf16, op_auto_mix_train_bf16 = _gen_op(
+            rand_seed, torch.nn.LayerNorm([10, 10]), is_bn=True)
+
+        ref_cpu = op_cpu(x_cpu)
+        with AutoDNNL(True), AutoMixPrecision(False):
+            res_bf16 = op_man_bf16(x_man_bf16)
+            self.assertEqual(res_bf16.dtype, torch.bfloat16)
+
+        # FW inference
+        with AutoMixPrecision(True, train=False):
+            self.assertEqual(x_auto_mix_inference.dtype, torch.float)
+            self.assertFalse(ipex.core.is_bf16_dil_tensor(x_auto_mix_inference))
+            res_auto_mix_inference = op_auto_mix_inference(x_auto_mix_inference)
+            self.assertEqual(res_auto_mix_inference.dtype, torch.float)
+            self.assertEqual(x_auto_mix_inference.dtype, torch.float)
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(res_auto_mix_inference))
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(x_auto_mix_inference))
+            self.assertEqual(res_bf16.float(), res_auto_mix_inference)
+
+        # FW train (input is not bf16 dil tensor)
+        with AutoMixPrecision(True, train=True):
+            self.assertEqual(x_auto_mix_train.dtype, torch.float)
+            self.assertFalse(ipex.core.is_bf16_dil_tensor(x_auto_mix_train))
+            res_auto_mix_train = op_auto_mix_train(x_auto_mix_train)
+            self.assertEqual(res_auto_mix_train.dtype, torch.float)
+            self.assertEqual(x_auto_mix_train.dtype, torch.float)
+            self.assertFalse(ipex.core.is_bf16_dil_tensor(res_auto_mix_train))
+            self.assertFalse(ipex.core.is_bf16_dil_tensor(x_auto_mix_train))
+            self.assertEqual(ref_cpu, res_auto_mix_train)
+
+        # FW train (input is bf16 dil tensor)
+        with AutoMixPrecision(True, train=True):
+            self.assertEqual(x_auto_mix_train_bf16.dtype, torch.float)
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(x_auto_mix_train_bf16))
+            res_auto_mix_train_bf16 = op_auto_mix_train_bf16(x_auto_mix_train_bf16)
+            self.assertEqual(res_auto_mix_train_bf16.dtype, torch.float)
+            self.assertEqual(x_auto_mix_train_bf16.dtype, torch.float)
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(res_auto_mix_train_bf16))
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(x_auto_mix_train_bf16))
+            self.assertEqual(res_bf16.float(), res_auto_mix_train_bf16)
+
+    def test_layer_norm_backward(self):
+        rand_seed = int(get_rand_seed())
+        print("{} rand seed: {}".format(sys._getframe().f_code.co_name, rand_seed))
+        x_cpu, _, x_auto_mix, x_man_bf16, x_auto_mix_bf16 = _gen_tensor(rand_seed, (2, 5, 10, 10), is_forward=False)
+
+        op_cpu, _, op_auto_mix, op_man_bf16, op_auto_mix_bf16 = _gen_op(rand_seed, torch.nn.LayerNorm([10, 10]), is_bn=True, is_forward=False)
+
+        out_cpu = op_cpu(x_cpu).sum()
+        out_cpu.backward()
+        with AutoDNNL(True), AutoMixPrecision(False, train=True):
+            out_man_bf16 = op_man_bf16(x_man_bf16).sum()
+            out_man_bf16.backward()
+            self.assertEqual(x_man_bf16.grad.dtype, torch.bfloat16)
+            self.assertEqual(x_cpu.grad.bfloat16().float(), x_man_bf16.grad, 1e-2)
+
+        # BW train (input is not bf16 dil tensor)
+        with AutoMixPrecision(True, train=True):
+            self.assertEqual(x_auto_mix.dtype, torch.float)
+            self.assertFalse(ipex.core.is_bf16_dil_tensor(x_auto_mix))
+            out_auto_mix = op_auto_mix(x_auto_mix).sum()
+            out_auto_mix.backward()
+            self.assertEqual(x_auto_mix.grad.dtype, torch.float)
+            self.assertFalse(ipex.core.is_bf16_dil_tensor(x_auto_mix.grad))
+            self.assertEqual(x_cpu.grad, x_auto_mix.grad)
+
+        # BW train (input is bf16 dil tensor)
+        with AutoMixPrecision(True, train=True):
+            self.assertEqual(x_auto_mix_bf16.dtype, torch.float)
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(x_auto_mix_bf16))
+            out_auto_mix_bf16 = op_auto_mix_bf16(x_auto_mix_bf16).sum()
+            out_auto_mix_bf16.backward()
+            self.assertEqual(x_auto_mix_bf16.grad.dtype, torch.float)
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(x_auto_mix_bf16.grad))
+            self.assertEqual(x_man_bf16.grad.float(), x_auto_mix_bf16.grad)
+
 class TestRelu(TestCase):
     def test_relu(self):
         rand_seed = int(get_rand_seed())
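The new TestLayerNorm cases mirror the surrounding batch-norm tests: compute an fp32 CPU reference, repeat the op under manual bf16 and auto-mix-precision modes, and assert on dtypes and numerical agreement. A stand-alone sketch of the underlying bf16-vs-fp32 comparison in stock PyTorch (no IPEX helpers; the tolerance is an arbitrary choice reflecting bf16's ~8-bit mantissa):

import torch

torch.manual_seed(42)
ln = torch.nn.LayerNorm([10, 10])
x = torch.randn(2, 5, 10, 10)

ref = ln(x)                             # fp32 reference
res_bf16 = ln.bfloat16()(x.bfloat16())  # same op with bf16 weights and input
assert res_bf16.dtype == torch.bfloat16

# Compare loosely: bf16 keeps only ~8 mantissa bits.
assert torch.allclose(ref, res_bf16.float(), rtol=1e-2, atol=1e-2)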

torch_ipex/csrc/aten_ipex_bridge.cpp

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ namespace bridge {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->dtype() == b.unsafeGetTensorImpl()->dtype()); \
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_contiguous() == b.unsafeGetTensorImpl()->is_contiguous()); \
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_contiguous(at::MemoryFormat::ChannelsLast) == b.unsafeGetTensorImpl()->is_contiguous(at::MemoryFormat::ChannelsLast)); \
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_contiguous(at::MemoryFormat::ChannelsLast3d) == b.unsafeGetTensorImpl()->is_contiguous(at::MemoryFormat::ChannelsLast3d)); \
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_strides_like_channels_last() == b.unsafeGetTensorImpl()->is_strides_like_channels_last()); \
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_non_overlapping_and_dense() == b.unsafeGetTensorImpl()->is_non_overlapping_and_dense()); \
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_wrapped_number() == b.unsafeGetTensorImpl()->is_wrapped_number()); \
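The added line extends the debug attribute check to the 5-D channels-last layout, which PyTorch exposes in Python as torch.channels_last_3d (in recent PyTorch builds). A small sketch of what the new assertion compares:

import torch

a = torch.randn(2, 3, 4, 5, 6)                  # contiguous NCDHW layout
b = a.to(memory_format=torch.channels_last_3d)  # same data, NDHWC strides

print(a.is_contiguous(memory_format=torch.channels_last_3d))  # False
print(b.is_contiguous(memory_format=torch.channels_last_3d))  # True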
