diff --git a/cmake/CPU.cmake b/cmake/CPU.cmake index 08a246f35..8ae17fb3e 100644 --- a/cmake/CPU.cmake +++ b/cmake/CPU.cmake @@ -27,7 +27,7 @@ IF(CMAKE_BUILD_TYPE MATCHES Debug) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -D_DEBUG") ELSE() message("Release build.") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DNDEBUG") ENDIF() # ---[ Build flags diff --git a/scripts/cpu/gen-dense-cpu-ops.py b/scripts/cpu/gen-dense-cpu-ops.py index 9784e8910..4096416fc 100755 --- a/scripts/cpu/gen-dense-cpu-ops.py +++ b/scripts/cpu/gen-dense-cpu-ops.py @@ -304,7 +304,7 @@ def is_out_func(fname): param_seq_str = param_var if param_var in dnnl_tensor_param_vars: if param_var == 'out' and is_out_func(fname): - code += ' TORCH_INTERNAL_ASSERT({}.is_contiguous());\n'.format(param_var) + code += ' TORCH_INTERNAL_ASSERT_DEBUG_ONLY({}.is_contiguous());\n'.format(param_var) else: param_seq_str = '{}.is_contiguous() ? {} : {}.contiguous()'.format(param_var, param_var, param_var) param_seq_str_vec.append(param_seq_str) @@ -334,10 +334,10 @@ def gen_fallback_prepare_code(self, cpp_sig): ipex_name = '_ipex_{}'.format(param.name) param.ipex_name = ipex_name check_cond = '{}.device().type() == at::DeviceType::DPCPP'.format(param.name) - op_check_code += ' TORCH_INTERNAL_ASSERT({});\n'.format(check_cond) + op_check_code += ' TORCH_INTERNAL_ASSERT_DEBUG_ONLY({});\n'.format(check_cond) code += ' at::TensorOptions {} = {}.device(at::DeviceType::CPU);\n'.format(ipex_name, param.name) elif param.core_type == 'Storage': - code += ' TORCH_INTERNAL_ASSERT({}.device_type() == c10::DeviceType::DPCPP);\n'.format(param.name) + code += ' TORCH_INTERNAL_ASSERT_DEBUG_ONLY({}.device_type() == c10::DeviceType::DPCPP);\n'.format(param.name) elif param.core_type == 'MemoryFormat': if param.is_optional: check_cond = '{}.value_or(c10::MemoryFormat::Contiguous) != c10::MemoryFormat::Contiguous'.format(param.name) @@ -352,7 +352,7 @@ def gen_fallback_prepare_code(self, cpp_sig): assert param.core_type == 'Tensor' ipex_name = '_ipex_{}'.format(param.name) check_cond = '{}.layout() == c10::kStrided'.format(param.name) - op_check_code += ' TORCH_INTERNAL_ASSERT({});\n'.format(check_cond) + op_check_code += ' TORCH_INTERNAL_ASSERT_DEBUG_ONLY({});\n'.format(check_cond) code += ' auto&& {} = bridge::{}({});\n'.format(ipex_name, _SHALLOW_FALLBACK_TO_CPU_TENSOR, param.name) param.ipex_name = ipex_name return op_check_code + code diff --git a/scripts/cpu/gen-sparse-cpu-ops.py b/scripts/cpu/gen-sparse-cpu-ops.py index bd4a014f7..3f99c9e18 100755 --- a/scripts/cpu/gen-sparse-cpu-ops.py +++ b/scripts/cpu/gen-sparse-cpu-ops.py @@ -260,10 +260,10 @@ def gen_fallback_prepare_code(self, cpp_sig): ipex_name = '_ipex_{}'.format(param.name) param.ipex_name = ipex_name check_cond = '{}.device().type() == at::DeviceType::DPCPP'.format(param.name) - op_check_code += ' TORCH_INTERNAL_ASSERT({});\n'.format(check_cond) + op_check_code += ' TORCH_INTERNAL_ASSERT_DEBUG_ONLY({});\n'.format(check_cond) code += ' at::TensorOptions {} = {}.device(at::DeviceType::CPU);\n'.format(ipex_name, param.name) elif param.core_type == 'Storage': - code += ' TORCH_INTERNAL_ASSERT({}.device_type() == c10::DeviceType::DPCPP);\n'.format(param.name) + code += ' TORCH_INTERNAL_ASSERT_DEBUG_ONLY({}.device_type() == c10::DeviceType::DPCPP);\n'.format(param.name) elif param.core_type == 'MemoryFormat': None elif param.core_type != 'Tensor': diff --git a/torch_ipex/csrc/aten_ipex_bridge.cpp b/torch_ipex/csrc/aten_ipex_bridge.cpp index 
62b187c98..266955201 100644 --- a/torch_ipex/csrc/aten_ipex_bridge.cpp +++ b/torch_ipex/csrc/aten_ipex_bridge.cpp @@ -21,26 +21,26 @@ namespace bridge { #if defined(_DEBUG) #define CHECK_TENSOR(a, b) \ - TORCH_INTERNAL_ASSERT(a.numel() == b.numel()); \ - TORCH_INTERNAL_ASSERT(a.dtype() == b.dtype()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->sizes() == b.unsafeGetTensorImpl()->sizes()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->dtype() == b.unsafeGetTensorImpl()->dtype()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->is_contiguous() == b.unsafeGetTensorImpl()->is_contiguous()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->is_contiguous(at::MemoryFormat::ChannelsLast) == b.unsafeGetTensorImpl()->is_contiguous(at::MemoryFormat::ChannelsLast)); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->is_strides_like_channels_last() == b.unsafeGetTensorImpl()->is_strides_like_channels_last()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->is_non_overlapping_and_dense() == b.unsafeGetTensorImpl()->is_non_overlapping_and_dense()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->is_wrapped_number() == b.unsafeGetTensorImpl()->is_wrapped_number()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->version_counter().current_version() == b.unsafeGetTensorImpl()->version_counter().current_version()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->allow_tensor_metadata_change() == b.unsafeGetTensorImpl()->allow_tensor_metadata_change()) + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.numel() == b.numel()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.dtype() == b.dtype()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->sizes() == b.unsafeGetTensorImpl()->sizes()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->dtype() == b.unsafeGetTensorImpl()->dtype()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_contiguous() == b.unsafeGetTensorImpl()->is_contiguous()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_contiguous(at::MemoryFormat::ChannelsLast) == b.unsafeGetTensorImpl()->is_contiguous(at::MemoryFormat::ChannelsLast)); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_strides_like_channels_last() == b.unsafeGetTensorImpl()->is_strides_like_channels_last()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_non_overlapping_and_dense() == b.unsafeGetTensorImpl()->is_non_overlapping_and_dense()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->is_wrapped_number() == b.unsafeGetTensorImpl()->is_wrapped_number()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->version_counter().current_version() == b.unsafeGetTensorImpl()->version_counter().current_version()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->allow_tensor_metadata_change() == b.unsafeGetTensorImpl()->allow_tensor_metadata_change()) #else #define CHECK_TENSOR(a, b) ((void) 0) #endif #if defined(_DEBUG) #define CHECK_TENSOR_CRITICAL(a, b, may_alias) \ - TORCH_INTERNAL_ASSERT(!may_alias || a.data_ptr() == b.data_ptr()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->strides() == b.unsafeGetTensorImpl()->strides()); \ - TORCH_INTERNAL_ASSERT(a.unsafeGetTensorImpl()->storage_offset() == b.unsafeGetTensorImpl()->storage_offset()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!may_alias || a.data_ptr() == b.data_ptr()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->strides() == b.unsafeGetTensorImpl()->strides()); \ + 
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.unsafeGetTensorImpl()->storage_offset() == b.unsafeGetTensorImpl()->storage_offset()); \ CHECK_TENSOR(a, b) #else #define CHECK_TENSOR_CRITICAL(a, b, may_alias) ((void) 0) @@ -48,12 +48,12 @@ namespace bridge { #if defined(_DEBUG) #define CHECK_SPARSE_TENSOR_CRITICAL(a, b, may_alias) \ - TORCH_INTERNAL_ASSERT(!may_alias || a._indices().data_ptr() == b._indices().data_ptr()); \ - TORCH_INTERNAL_ASSERT(!may_alias || a._values().data_ptr() == b._values().data_ptr()); \ - TORCH_INTERNAL_ASSERT(a.sparse_dim() == b.sparse_dim()); \ - TORCH_INTERNAL_ASSERT(a.dense_dim() == b.dense_dim()); \ - TORCH_INTERNAL_ASSERT(a._nnz() == b._nnz()); \ - TORCH_INTERNAL_ASSERT(a.is_coalesced() == b.is_coalesced()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!may_alias || a._indices().data_ptr() == b._indices().data_ptr()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!may_alias || a._values().data_ptr() == b._values().data_ptr()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.sparse_dim() == b.sparse_dim()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.dense_dim() == b.dense_dim()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a._nnz() == b._nnz()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.is_coalesced() == b.is_coalesced()); \ CHECK_TENSOR(a._indices(), b._indices()); \ CHECK_TENSOR(a._values(), b._values()) #else @@ -66,21 +66,21 @@ void reorderDilTensorToPublic(const at::Tensor& ipexTensor) { void *data_ctx = ipexTensor.unsafeGetTensorImpl()->storage().data_ptr().get_context(); cpu::ShadeDataContext *shade_data_context = (cpu::ShadeDataContext*)data_ctx; #if defined(_DEBUG) - TORCH_INTERNAL_ASSERT(! (shade_data_context->dil_tensor.is_empty())); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! (shade_data_context->dil_tensor.is_empty())); #endif dil::tensor &dil_tensor = shade_data_context->dil_tensor; if (dil_tensor.is_public_format()) { #if defined(_DEBUG) - TORCH_INTERNAL_ASSERT(shade_data_context->cpu_raw_data == shade_data_context->dil_tensor.get_data_handle()); - TORCH_INTERNAL_ASSERT(shade_data_context->cpu_raw_data != nullptr); - TORCH_INTERNAL_ASSERT(shade_data_context->cpu_del_fun != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(shade_data_context->cpu_raw_data == shade_data_context->dil_tensor.get_data_handle()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(shade_data_context->cpu_raw_data != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(shade_data_context->cpu_del_fun != nullptr); #endif } else { #if defined(_DEBUG) auto& data_ptr = ipexTensor.storage().unsafeGetStorageImpl()->data_ptr(); - TORCH_INTERNAL_ASSERT(data_ptr.get_deleter() == &(cpu::ShadeDataContext::freeShadeDataContext)); - TORCH_INTERNAL_ASSERT(shade_data_context->cpu_del_fun == nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data_ptr.get_deleter() == &(cpu::ShadeDataContext::freeShadeDataContext)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(shade_data_context->cpu_del_fun == nullptr); #endif auto pub_tensor = dil_tensor.to_public(nullptr, dil_tensor.get_data_type()); @@ -116,7 +116,7 @@ void attachShadeDataConext(const at::Tensor& tensor) { auto cur_del_fn = data_ptr.get_deleter(); bool res = data_ptr.compare_exchange_deleter(cur_del_fn, &(c10::detail::deleteNothing)); - TORCH_INTERNAL_ASSERT(res); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(res); // Make sure that does not triger free resource for set_ptr cpu::ShadeDataContext *shade_data_context = cpu::ShadeDataContext::allocShadeDataContext(); shade_data_context->cpu_raw_data = data_ptr.get(); @@ -131,44 +131,6 @@ void attachShadeDataConext(const at::Tensor& tensor) { } -// Fallback DPCPP tensor to CPU Tensor. 
-// It will allocate new memory buffer and then duplicate the DPCPP tensor buffer to create new CPU Tensor -at::Tensor fallbackToCPUTensor(const at::Tensor& ipexTensor) { - TORCH_INTERNAL_ASSERT(ipexTensor.defined()); - TORCH_INTERNAL_ASSERT(!ipexTensor.is_sparse()); - TORCH_INTERNAL_ASSERT(ipexTensor.is_contiguous()); - TORCH_INTERNAL_ASSERT(ipexTensor.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(ipexTensor.device().type() == at::DeviceType::DPCPP); - if (ipexTensor.device().is_cpu()) - return ipexTensor; - - if (ipexTensor.device().type() != at::DeviceType::DPCPP) { - assert(false); - } - - auto* allocator = c10::GetAllocator(c10::DeviceType::CPU); - int64_t nelements = ipexTensor.numel(); - auto dtype = ipexTensor.dtype(); - int64_t data_size = nelements * dtype.itemsize(); - auto storage_impl = c10::make_intrusive( - dtype, - nelements, - allocator->allocate(data_size), - allocator, - /*resizeable=*/true); - memcpy(storage_impl->data(), ipexTensor.unsafeGetTensorImpl()->data(), data_size); - - auto _tensor = at::detail::make_tensor(storage_impl, at::DispatchKey::CPUTensorId); - IPEXTensorImpl::CopyMetadata(_tensor.unsafeGetTensorImpl(), ipexTensor.unsafeGetTensorImpl()); - auto _tensor_sizes = ipexTensor.sizes(); - if (_tensor_sizes.size() != 1 || _tensor_sizes[0] != 0) { - _tensor.unsafeGetTensorImpl()->set_sizes_contiguous(_tensor_sizes); - } - CHECK_TENSOR(_tensor, ipexTensor); - return _tensor; -} - - // Unpack CPU tensor from ipex tensor and return to caller directly //at::Tensor shallowFallbackToCPUShadeTensor(const at::Tensor& ipexTensor) { at::Tensor shallowFallbackToCPUTensor(const at::Tensor& ipexTensor) { @@ -177,13 +139,13 @@ at::Tensor shallowFallbackToCPUTensor(const at::Tensor& ipexTensor) { } if (ipexTensor.device().is_cpu()) { - TORCH_INTERNAL_ASSERT(! (ipexTensor.key_set().has(at::DispatchKey::DPCPPTensorId))); - TORCH_INTERNAL_ASSERT(! (ipexTensor.key_set().has(at::DispatchKey::SparseDPCPPTensorId))); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! (ipexTensor.key_set().has(at::DispatchKey::DPCPPTensorId))); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! (ipexTensor.key_set().has(at::DispatchKey::SparseDPCPPTensorId))); return ipexTensor; } - TORCH_INTERNAL_ASSERT(ipexTensor.device().is_dpcpp()); - TORCH_INTERNAL_ASSERT( + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.device().is_dpcpp()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( ipexTensor.key_set().has(at::DispatchKey::DPCPPTensorId) || ipexTensor.key_set().has(at::DispatchKey::SparseDPCPPTensorId)); @@ -204,7 +166,7 @@ at::Tensor shallowFallbackToCPUTensor(const at::Tensor& ipexTensor) { return shallowFallbackToCPUTensorImpl(ipexTensor); } - TORCH_INTERNAL_ASSERT(data_ctx != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data_ctx != nullptr); cpu::ShadeDataContext *shade_data_context = (cpu::ShadeDataContext*)data_ctx; // Branch 2.1: Dense + Dil Tensor if (cpu::ShadeDataContext::isDilTensor(ipexTensor)) { @@ -224,16 +186,16 @@ at::Tensor shallowFallbackToCPUTensorImpl(const at::Tensor& ipexTensor) { } if (ipexTensor.device().is_cpu()) { - TORCH_INTERNAL_ASSERT(! (ipexTensor.key_set().has(at::DispatchKey::DPCPPTensorId))); - TORCH_INTERNAL_ASSERT(! (ipexTensor.key_set().has(at::DispatchKey::SparseDPCPPTensorId))); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! (ipexTensor.key_set().has(at::DispatchKey::DPCPPTensorId))); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! 
(ipexTensor.key_set().has(at::DispatchKey::SparseDPCPPTensorId))); return ipexTensor; } if (ipexTensor.is_sparse()) { - TORCH_INTERNAL_ASSERT(ipexTensor.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.layout() == c10::kSparse); // [NOTE]: Use _indices and _values interfaces to bypass non-coalesced check - TORCH_INTERNAL_ASSERT(ipexTensor._indices().layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(ipexTensor._values().layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor._indices().layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor._values().layout() == c10::kStrided); auto&& cpu_indices = shallowFallbackToCPUTensorImpl(ipexTensor._indices()); auto&& cpu_values = shallowFallbackToCPUTensorImpl(ipexTensor._values()); @@ -249,9 +211,9 @@ at::Tensor shallowFallbackToCPUTensorImpl(const at::Tensor& ipexTensor) { return _tensor; } else { auto *ipex_tensor_impl = ipexTensor.unsafeGetTensorImpl(); - TORCH_INTERNAL_ASSERT(ipex_tensor_impl != nullptr); - TORCH_INTERNAL_ASSERT(ipex_tensor_impl->has_storage()); - TORCH_INTERNAL_ASSERT(ipexTensor.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipex_tensor_impl != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipex_tensor_impl->has_storage()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.layout() == c10::kStrided); auto ipex_tensor_storage = ipex_tensor_impl->storage().unsafeGetStorageImpl(); ipex_tensor_storage->data_ptr().unsafe_set_device(c10::Device(at::DeviceType::CPU)); auto _tensor = at::detail::make_tensor(ipexTensor.storage(), at::DispatchKey::CPUTensorId); @@ -266,39 +228,6 @@ at::Tensor shallowFallbackToCPUTensorImpl(const at::Tensor& ipexTensor) { } -// Upgrade CPU tensor to DPCPP Tensor. -// It will allocate new memory buffer and then duplicate the CPU tensor buffer to create new DPCPP Tensor -at::Tensor upgradeToDPCPPTensor(const at::Tensor& cpuTensor) { - TORCH_INTERNAL_ASSERT(cpuTensor.defined()); - TORCH_INTERNAL_ASSERT(!cpuTensor.is_sparse()); - TORCH_INTERNAL_ASSERT(cpuTensor.is_contiguous()); - TORCH_INTERNAL_ASSERT(cpuTensor.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(cpuTensor.device().type() == at::DeviceType::CPU); - if (cpuTensor.device().type() == at::DeviceType::DPCPP) { - return cpuTensor; - } - - auto* allocator = c10::GetAllocator(c10::DeviceType::DPCPP); - int64_t nelements = cpuTensor.numel(); - auto dtype = cpuTensor.dtype(); - int64_t data_size = nelements * dtype.itemsize(); - auto storage_impl = c10::make_intrusive( - dtype, - nelements, - allocator->allocate(data_size), - allocator, - /*resizeable=*/true); - memcpy(storage_impl->data(), cpuTensor.unsafeGetTensorImpl()->data(), data_size); - auto&& _tensor = at::detail::make_tensor(storage_impl, at::DispatchKey::DPCPPTensorId); - auto _tensor_sizes = cpuTensor.sizes(); - if (_tensor_sizes.size() != 1 || _tensor_sizes[0] != 0) { - _tensor.unsafeGetTensorImpl()->set_sizes_contiguous(_tensor_sizes); - } - IPEXTensorImpl::CopyMetadata(_tensor.unsafeGetTensorImpl(), cpuTensor.unsafeGetTensorImpl()); - CHECK_TENSOR(_tensor, cpuTensor); - return _tensor; -} - // Upgrade CPU tensor to DPCPP Tensor with shallow copy // It will create an new DPCPP tensor but shares CPU tensor buffer // [NOTE]: Device info of Dense CPU tensor is polluted. 
@@ -307,12 +236,12 @@ at::Tensor shallowUpgradeToDPCPPTensor(const at::Tensor& cpuTensor) { return at::Tensor(); } - TORCH_INTERNAL_ASSERT(cpuTensor.device().type() == at::DeviceType::CPU); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.device().type() == at::DeviceType::CPU); if (cpuTensor.is_sparse()) { - TORCH_INTERNAL_ASSERT(cpuTensor.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.layout() == c10::kSparse); // [NOTE]: Use _indices and _values interfaces to bypass non-coalesced check - TORCH_INTERNAL_ASSERT(cpuTensor._indices().layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(cpuTensor._values().layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor._indices().layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor._values().layout() == c10::kStrided); auto&& ipex_indices = shallowUpgradeToDPCPPTensor(cpuTensor._indices()); auto&& ipex_values = shallowUpgradeToDPCPPTensor(cpuTensor._values()); // Create ipex sparse tensor and copy meta data from cpu sparse tensor @@ -327,19 +256,19 @@ at::Tensor shallowUpgradeToDPCPPTensor(const at::Tensor& cpuTensor) { return _tensor; } else { auto *cpu_tensor_impl = cpuTensor.unsafeGetTensorImpl(); - TORCH_INTERNAL_ASSERT(cpu_tensor_impl != nullptr); - TORCH_INTERNAL_ASSERT(cpu_tensor_impl->has_storage()); - TORCH_INTERNAL_ASSERT(cpuTensor.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpu_tensor_impl != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpu_tensor_impl->has_storage()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.layout() == c10::kStrided); auto cpu_storage = cpu_tensor_impl->storage().unsafeGetStorageImpl(); // [NOTE]: If the deleter of DPCPP::CPU is different form CPU deleter, we need to call // compare_exchange_deleter of DataPtr to update deleter cpu_storage->data_ptr().unsafe_set_device(c10::Device(at::DeviceType::DPCPP)); auto _tensor = at::detail::make_tensor(cpuTensor.storage(), at::DispatchKey::DPCPPTensorId); - TORCH_INTERNAL_ASSERT(_tensor.device().type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_tensor.device().type() == at::DeviceType::DPCPP); IPEXTensorImpl* ipex_impl = (IPEXTensorImpl *)_tensor.unsafeGetTensorImpl(); ipex_impl->copy_meta_info(cpu_tensor_impl); - TORCH_INTERNAL_ASSERT(! cpuTensor.requires_grad()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! 
cpuTensor.requires_grad()); CHECK_TENSOR_CRITICAL(_tensor, cpuTensor, true); //TODO: Cannot set reserved_ // dest_impl->reserved_ = src_impl->reserved_; @@ -350,25 +279,25 @@ at::Tensor shallowUpgradeToDPCPPTensor(const at::Tensor& cpuTensor) { at::Tensor shallowUpgradeToDPCPPTensorA(const at::Tensor& ipexTensor, const at::Tensor& cpuTensor) { - TORCH_INTERNAL_ASSERT(ipexTensor.defined()); - TORCH_INTERNAL_ASSERT(cpuTensor.defined()); - TORCH_INTERNAL_ASSERT(!ipexTensor.is_sparse()); - TORCH_INTERNAL_ASSERT(!cpuTensor.is_sparse()); - TORCH_INTERNAL_ASSERT(ipexTensor.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(cpuTensor.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(ipexTensor.device().type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.defined()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.defined()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!ipexTensor.is_sparse()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!cpuTensor.is_sparse()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.device().type() == at::DeviceType::DPCPP); ipexTensor.unsafeGetTensorImpl()->storage().unsafeGetStorageImpl()->data_ptr().unsafe_set_device(c10::Device(at::DeviceType::DPCPP)); - TORCH_INTERNAL_ASSERT(ipexTensor.storage().device_type() == at::DeviceType::DPCPP); - TORCH_INTERNAL_ASSERT(cpuTensor.device().type() == at::DeviceType::CPU); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.storage().device_type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.device().type() == at::DeviceType::CPU); // The ipexTensor and cpuTensor shares same storage. - TORCH_INTERNAL_ASSERT(cpuTensor.storage().device_type() == at::DeviceType::DPCPP); - TORCH_INTERNAL_ASSERT(ipexTensor.storage().data() == cpuTensor.storage().data()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.storage().device_type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.storage().data() == cpuTensor.storage().data()); auto _tensor = at::detail::make_tensor(at::Storage(ipexTensor.storage()), at::DispatchKey::DPCPPTensorId); - TORCH_INTERNAL_ASSERT(_tensor.device().type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_tensor.device().type() == at::DeviceType::DPCPP); IPEXTensorImpl* ipex_impl = (IPEXTensorImpl *)_tensor.unsafeGetTensorImpl(); ipex_impl->copy_meta_info(cpuTensor.unsafeGetTensorImpl()); ipex_impl->copy_auto_grad(cpuTensor.unsafeGetTensorImpl()); @@ -382,25 +311,25 @@ at::Tensor shallowUpgradeToDPCPPTensorA(const at::Tensor& ipexTensor, const at:: // Upgrade CPU tensor to DPCPP Tensor with shallow copy // It will not create an new DPCPP tensor but shares CPU tensor buffer const at::Tensor& shallowUpgradeToDPCPPTensorAW(const at::Tensor& ipexTensor, const at::Tensor& cpuTensor) { - TORCH_INTERNAL_ASSERT(ipexTensor.defined()); - TORCH_INTERNAL_ASSERT(cpuTensor.defined()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.defined()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.defined()); // The dispatch priority of DPCPPTensorId is higher than other CPU tensor ids. So if a tensor is CPU and // another tensor is DPCPP, it still will be disptached to DPCPP OPs. // ex, a = tensor(1, device='dpcpp')), a.to('cpu') // The above code will call AtenIpexCPUDefault::copy_ and "self" parameter is cpu tensor and "src" parameter is dpcpp tensor. 
if (ipexTensor.device().type() == cpuTensor.device().type()) { - TORCH_INTERNAL_ASSERT(cpuTensor.device().type() == at::DeviceType::CPU); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.device().type() == at::DeviceType::CPU); return ipexTensor; } - TORCH_INTERNAL_ASSERT(cpuTensor.device().type() == at::DeviceType::CPU); - TORCH_INTERNAL_ASSERT(ipexTensor.device().type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.device().type() == at::DeviceType::CPU); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.device().type() == at::DeviceType::DPCPP); if (ipexTensor.is_sparse()) { - TORCH_INTERNAL_ASSERT(cpuTensor.is_sparse()); - TORCH_INTERNAL_ASSERT(ipexTensor.layout() == c10::kSparse); - TORCH_INTERNAL_ASSERT(cpuTensor.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.is_sparse()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.layout() == c10::kSparse); // NOTICE: // In PyTorch, alias semantics is `may alias`, not `must alias`. // e.g. some sparse 'alias marked' ops are not alias actually, @@ -426,10 +355,10 @@ const at::Tensor& shallowUpgradeToDPCPPTensorAW(const at::Tensor& ipexTensor, co CHECK_SPARSE_TENSOR_CRITICAL(ipexTensor, cpuTensor, may_alias); return ipexTensor; } else { - TORCH_INTERNAL_ASSERT(!ipexTensor.is_sparse()); - TORCH_INTERNAL_ASSERT(!cpuTensor.is_sparse()); - TORCH_INTERNAL_ASSERT(ipexTensor.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(cpuTensor.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!ipexTensor.is_sparse()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!cpuTensor.is_sparse()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpuTensor.layout() == c10::kStrided); auto ipex_tensor_storage_impl = ipexTensor.unsafeGetTensorImpl()->storage().unsafeGetStorageImpl(); auto cpu_tensor_storage_impl = cpuTensor.unsafeGetTensorImpl()->storage().unsafeGetStorageImpl(); @@ -441,7 +370,7 @@ const at::Tensor& shallowUpgradeToDPCPPTensorAW(const at::Tensor& ipexTensor, co ipexTensor.unsafeGetTensorImpl()->set_storage(cpuTensor.storage()); } - TORCH_INTERNAL_ASSERT(ipexTensor.data_ptr() == cpuTensor.data_ptr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipexTensor.data_ptr() == cpuTensor.data_ptr()); // NOTE: Cannot set storage data_ptr by set_data_ptr. // set_data_ptr will release caller tensor's original data_ptr. 
It is wrong here because @@ -465,19 +394,6 @@ const at::Tensor& shallowUpgradeToDPCPPTensorAW(const at::Tensor& ipexTensor, co } -std::vector fallbackToCPUTensorList(const at::TensorList& tensor_list) { - std::vector dpcpp_tensor_vec(tensor_list.size()); - for (size_t i = 0; i < tensor_list.size(); ++i) { - const at::Tensor& tensor = tensor_list[i]; - TORCH_INTERNAL_ASSERT(tensor.defined()); - if (tensor.defined()) { - dpcpp_tensor_vec[i] = fallbackToCPUTensor(tensor); - } - } - return dpcpp_tensor_vec; -} - - std::vector shallowFallbackToCPUTensorList(const at::TensorList& tensor_list) { std::vector dpcpp_tensor_vec(tensor_list.size()); for (size_t i = 0; i < tensor_list.size(); ++i) { @@ -526,8 +442,6 @@ void reorderTensorToScalaraType(const at::Tensor& ipexTensor, at::ScalarType dst if (!check_tensor_own_whole_storage(ipexTensor)) { return; - } else { - TORCH_INTERNAL_ASSERT(false); } if (check_tensor_own_shade_context(ipexTensor)) { @@ -558,26 +472,13 @@ void reorderTensorToScalaraType(const at::Tensor& ipexTensor, at::ScalarType dst ipexTensor.unsafeGetTensorImpl()->set_storage(storage_impl); } -std::vector upgradeToDPCPPTensorVec(const std::vector &tensor_vec) { - std::vector ret_dpcpp_tensor_vec; - for (size_t i = 0; i < tensor_vec.size(); i++) { - auto&& cur_tensor = tensor_vec[i]; - TORCH_INTERNAL_ASSERT(cur_tensor.defined()); - TORCH_INTERNAL_ASSERT(cur_tensor.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(cur_tensor.is_contiguous()); - auto&& cur_dpcpp_tensor = upgradeToDPCPPTensor(cur_tensor); - ret_dpcpp_tensor_vec.push_back(cur_dpcpp_tensor); - } - return ret_dpcpp_tensor_vec; -} - std::vector shallowUpgradeToDPCPPTensorVec(const std::vector &tensor_vec) { std::vector ret_dpcpp_tensor_vec; for (size_t i = 0; i < tensor_vec.size(); i++) { auto&& cur_tensor = tensor_vec[i]; - TORCH_INTERNAL_ASSERT(cur_tensor.defined()); - TORCH_INTERNAL_ASSERT(cur_tensor.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cur_tensor.defined()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cur_tensor.layout() == c10::kStrided); auto&& cur_dpcpp_tensor = shallowUpgradeToDPCPPTensor(cur_tensor); ret_dpcpp_tensor_vec.push_back(cur_dpcpp_tensor); } diff --git a/torch_ipex/csrc/aten_ipex_bridge.h b/torch_ipex/csrc/aten_ipex_bridge.h index 02ce9822d..bed667198 100644 --- a/torch_ipex/csrc/aten_ipex_bridge.h +++ b/torch_ipex/csrc/aten_ipex_bridge.h @@ -10,9 +10,7 @@ namespace torch_ipex { namespace bridge { // Convert DPCPP tensor to CPU tensor -at::Tensor fallbackToCPUTensor(const at::Tensor& ipexTensor); at::Tensor shallowFallbackToCPUTensor(const at::Tensor& ipexTensor); -std::vector fallbackToCPUTensorList(const at::TensorList&); std::vector shallowFallbackToCPUTensorList(const at::TensorList&); void attachShadeDataConext(const at::Tensor& tensor); @@ -51,9 +49,7 @@ void reorderTensorToScalarTypeForDNNL(const at::Tensor& ipexTensor, at::ScalarTy void reorderTensorToScalaraType(const at::Tensor& ipexTensor, at::ScalarType dstScalarType); // Convert CPU tensor to DPCPP tensor -at::Tensor upgradeToDPCPPTensor(const at::Tensor& ipexTensor); at::Tensor shallowUpgradeToDPCPPTensor(const at::Tensor& ipexTensor); -std::vector upgradeToDPCPPTensorVec(const std::vector &); std::vector shallowUpgradeToDPCPPTensorVec(const std::vector &); // The last character A means alias. 
This function is for aten alias diff --git a/torch_ipex/csrc/cpu/DevOPs.cpp b/torch_ipex/csrc/cpu/DevOPs.cpp index ac7caaa39..6d8ca4dbc 100644 --- a/torch_ipex/csrc/cpu/DevOPs.cpp +++ b/torch_ipex/csrc/cpu/DevOPs.cpp @@ -28,10 +28,10 @@ namespace cpu { #define DEBUG(fmt) #endif -#define CHECK_DNNL_OP_PRE_COND(tensor) \ - TORCH_INTERNAL_ASSERT(tensor.defined()); \ - TORCH_INTERNAL_ASSERT(tensor.is_contiguous()); \ - TORCH_INTERNAL_ASSERT(tensor.layout() == c10::kStrided) +#define CHECK_DNNL_OP_PRE_COND(tensor) \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.defined()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.is_contiguous()); \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.layout() == c10::kStrided) at::Tensor AtenIpexCPUDev::dil_convolution( const at::Tensor & input, @@ -41,6 +41,7 @@ at::Tensor AtenIpexCPUDev::dil_convolution( at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); DEBUG("AtenIpexCPUDev::dil_convolution\n"); dil::tensor dil_input; dil::tensor dil_weight; @@ -175,18 +176,18 @@ at::Tensor AtenIpexCPUDev::dil_convolution_overrideable(const at::Tensor & input at::Tensor AtenIpexCPUDev::mkldnn_convolution(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) { DEBUG("AtenIpexCPUDev::mkldnn_convolution\n"); - TORCH_INTERNAL_ASSERT(self.defined()); - TORCH_INTERNAL_ASSERT(weight.defined()); - TORCH_INTERNAL_ASSERT(self.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(weight.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(!(bias.defined()) || (bias.defined() && bias.layout() == c10::kStrided)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.defined()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(weight.defined()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(weight.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!(bias.defined()) || (bias.defined() && bias.layout() == c10::kStrided)); auto&& _ipex_self = bridge::shallowFallbackToCPUTensor(self); auto&& _ipex_weight = bridge::shallowFallbackToCPUTensor(weight); auto&& _ipex_bias = bridge::shallowFallbackToCPUTensor(bias); auto&& _ipex_result = at::mkldnn_convolution(_ipex_self.contiguous(), _ipex_weight.contiguous(), _ipex_bias.contiguous(), padding, stride, dilation, groups); static_cast(_ipex_result); // Avoid warnings in case not used - TORCH_INTERNAL_ASSERT(_ipex_result.is_contiguous()); - TORCH_INTERNAL_ASSERT(_ipex_result.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_ipex_result.is_contiguous()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_ipex_result.layout() == c10::kStrided); return bridge::shallowUpgradeToDPCPPTensor(_ipex_result); } @@ -210,12 +211,12 @@ std::tuple AtenIpexCPUDev::dil_convolution_bac std::tuple AtenIpexCPUDev::mkldnn_convolution_backward(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, std::array output_mask) { DEBUG("AtenIpexCPUDev::mkldnn_convolution_backward\n"); - TORCH_INTERNAL_ASSERT(self.defined()); - TORCH_INTERNAL_ASSERT(grad_output.defined()); - TORCH_INTERNAL_ASSERT(weight.defined()); - TORCH_INTERNAL_ASSERT(self.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(grad_output.layout() == c10::kStrided); - TORCH_INTERNAL_ASSERT(weight.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.defined()); + 
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grad_output.defined()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(weight.defined()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grad_output.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(weight.layout() == c10::kStrided); auto&& _ipex_self = bridge::shallowFallbackToCPUTensor(self); auto&& _ipex_grad_output = bridge::shallowFallbackToCPUTensor(grad_output); auto&& _ipex_weight = bridge::shallowFallbackToCPUTensor(weight); diff --git a/torch_ipex/csrc/cpu/ExtendOPs.cpp b/torch_ipex/csrc/cpu/ExtendOPs.cpp index 636bef071..69d08d1bb 100644 --- a/torch_ipex/csrc/cpu/ExtendOPs.cpp +++ b/torch_ipex/csrc/cpu/ExtendOPs.cpp @@ -13,19 +13,19 @@ namespace torch_ipex { void AtenIpexTypeExt::packed_add_(at::Tensor & top_half, at::Tensor & bot_half, const at::Tensor & grad, float alpha) { - TORCH_INTERNAL_ASSERT(grad.scalar_type() == at::ScalarType::BFloat16); - TORCH_INTERNAL_ASSERT(top_half.scalar_type() == at::ScalarType::BFloat16); - TORCH_INTERNAL_ASSERT(bot_half.scalar_type() == at::ScalarType::BFloat16); - TORCH_INTERNAL_ASSERT(grad.device().type() == at::DeviceType::DPCPP); - TORCH_INTERNAL_ASSERT(top_half.device().type() == at::DeviceType::DPCPP); - TORCH_INTERNAL_ASSERT(bot_half.device().type() == at::DeviceType::DPCPP); - TORCH_INTERNAL_ASSERT(top_half.sizes() == bot_half.sizes()); - TORCH_INTERNAL_ASSERT(top_half.is_contiguous()); - TORCH_INTERNAL_ASSERT(bot_half.is_contiguous()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grad.scalar_type() == at::ScalarType::BFloat16); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(top_half.scalar_type() == at::ScalarType::BFloat16); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(bot_half.scalar_type() == at::ScalarType::BFloat16); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grad.device().type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(top_half.device().type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(bot_half.device().type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(top_half.sizes() == bot_half.sizes()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(top_half.is_contiguous()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(bot_half.is_contiguous()); RECORD_FUNCTION("packed_add_", std::vector({top_half, bot_half, grad, alpha}), torch::autograd::Node::peek_at_next_sequence_nr()); if (grad.is_sparse()) { - TORCH_INTERNAL_ASSERT(top_half.dim() == 2); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(top_half.dim() == 2); auto sparse_nnz = grad._nnz(); auto sparse_dim = grad.sparse_dim(); auto values = grad._values(); @@ -34,14 +34,14 @@ void AtenIpexTypeExt::packed_add_(at::Tensor & top_half, at::Tensor & bot_half, auto feature_size = values.stride(0); auto indices_accessor = indices.accessor(); - TORCH_INTERNAL_ASSERT(values.is_contiguous()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.is_contiguous()); auto value_ptr = values.data_ptr(); auto top_half_ptr = top_half.data_ptr(); auto bot_half_ptr = bot_half.data_ptr(); - TORCH_INTERNAL_ASSERT(value_ptr != nullptr); - TORCH_INTERNAL_ASSERT(top_half_ptr != nullptr); - TORCH_INTERNAL_ASSERT(bot_half_ptr != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(value_ptr != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(top_half_ptr != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(bot_half_ptr != nullptr); std::vector sparse_stride(sparse_dim); for (int64_t d = 0; d < sparse_dim; d++) { @@ -80,7 +80,7 @@ void AtenIpexTypeExt::packed_add_(at::Tensor & top_half, at::Tensor & bot_half, } }); } else { - TORCH_INTERNAL_ASSERT(grad.is_contiguous()); + 
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grad.is_contiguous()); //TODO: vector implementation basing on vector size union packed_bf16 { unsigned short s[2]; @@ -201,15 +201,15 @@ inline at::Tensor _interaction_forward(const std::vector & input) { std::vector feature_sizes(input.size()); std::vector input_data(input.size()); for (int i = 0; i < input.size(); i++) { - TORCH_INTERNAL_ASSERT(input[i].is_contiguous()); - TORCH_INTERNAL_ASSERT(input[i].device().is_dpcpp()); - TORCH_INTERNAL_ASSERT(input[i].dim() == 2); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input[i].is_contiguous()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input[i].device().is_dpcpp()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input[i].dim() == 2); feature_sizes[i] = input[i].sizes()[1]; total_feature_size += input[i].sizes()[1]; input_data[i] = input[i].data_ptr(); } auto vector_nums = total_feature_size / vector_size; - TORCH_INTERNAL_ASSERT(total_feature_size % vector_size == 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(total_feature_size % vector_size == 0); auto interact_feature_size = vector_nums * (vector_nums - 1) / 2; auto tr_vector_size = sizeof(T) == 4 ? vector_size : vector_size / 2; auto out = at::empty({batch_size, interact_feature_size + vector_size}, input[0].options()); @@ -239,7 +239,7 @@ inline at::Tensor _interaction_forward(const std::vector & input) { template inline std::vector _interaction_backward(const at::Tensor & grad_out, const std::vector & input) { - TORCH_INTERNAL_ASSERT(grad_out.is_contiguous()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grad_out.is_contiguous()); RECORD_FUNCTION("_interaction_backward", std::vector({grad_out, input}), torch::autograd::Node::peek_at_next_sequence_nr()); uint32_t total_feature_size = 0; int64_t batch_size = input[0].sizes()[0]; @@ -257,7 +257,7 @@ inline std::vector _interaction_backward(const at::Tensor & grad_out output_data[i] = output[i].data_ptr(); } auto vector_nums = total_feature_size / vector_size; - TORCH_INTERNAL_ASSERT(total_feature_size % vector_size == 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(total_feature_size % vector_size == 0); auto interact_feature_size = vector_nums * (vector_nums - 1) / 2; auto grad_out_data = grad_out.data_ptr(); @@ -305,11 +305,11 @@ inline std::vector _interaction_backward(const at::Tensor & grad_out at::Tensor AtenIpexTypeExt::interaction_forward(const std::vector & input) { if (input[0].scalar_type() == at::kFloat) { - for (const auto &in : input) { TORCH_INTERNAL_ASSERT(in.scalar_type() == at::kFloat); } + for (const auto &in : input) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(in.scalar_type() == at::kFloat); } return _interaction_forward(input); } else { - TORCH_INTERNAL_ASSERT(input[0].scalar_type() == at::kBFloat16); - for (const auto &in : input) { TORCH_INTERNAL_ASSERT(in.scalar_type() == at::kBFloat16); } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input[0].scalar_type() == at::kBFloat16); + for (const auto &in : input) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(in.scalar_type() == at::kBFloat16); } return _interaction_forward(input); } } @@ -318,18 +318,18 @@ std::vector AtenIpexTypeExt::interaction_backward(const at::Tensor & if (grad_out.scalar_type() == at::kFloat) { return _interaction_backward(grad_out, input); } else { - TORCH_INTERNAL_ASSERT(grad_out.scalar_type() == at::kBFloat16); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grad_out.scalar_type() == at::kBFloat16); return _interaction_backward(grad_out, input); } } template static inline at::Tensor _embedding_bag_forward(const at::Tensor &weights, const at::Tensor &inputs, const at::Tensor &offsets) { - 
TORCH_INTERNAL_ASSERT(weights.is_contiguous()); - TORCH_INTERNAL_ASSERT(inputs.is_contiguous()); - TORCH_INTERNAL_ASSERT(offsets.is_contiguous()); - TORCH_INTERNAL_ASSERT(inputs.dim() == 1); - TORCH_INTERNAL_ASSERT(weights.dim() == 2); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(weights.is_contiguous()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inputs.is_contiguous()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(offsets.is_contiguous()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inputs.dim() == 1); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(weights.dim() == 2); RECORD_FUNCTION("_embedding_bag_forward", std::vector({weights, inputs, offsets}), torch::autograd::Node::peek_at_next_sequence_nr()); auto batch_size = offsets.size(0); auto num_input = inputs.size(0); @@ -345,7 +345,7 @@ static inline at::Tensor _embedding_bag_forward(const at::Tensor &weights, const auto inputs_start = offsets_data[i]; auto inputs_end = (i < batch_size - 1) ? offsets_data[i + 1] : num_input; // TODO: add acc_t support for bag size larger than 1 - TORCH_INTERNAL_ASSERT(inputs_end - inputs_start == 1); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inputs_end - inputs_start == 1); auto out_data_ptr = &output_data[i * vector_size]; #pragma omp simd for (int64_t v = 0; v < vector_size; v++) out_data_ptr[v] = 0.0; @@ -361,8 +361,8 @@ static inline at::Tensor _embedding_bag_forward(const at::Tensor &weights, const template static inline at::Tensor _embedding_bag_backward(const at::Tensor &grad_out, const at::Tensor &weights, const at::Tensor &inputs, const at::Tensor offsets) { - TORCH_INTERNAL_ASSERT(inputs.dim() == 1); - TORCH_INTERNAL_ASSERT(grad_out.dim() == 2); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inputs.dim() == 1); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grad_out.dim() == 2); RECORD_FUNCTION("_embedding_bag_backward", std::vector({grad_out, weights, inputs, offsets}), torch::autograd::Node::peek_at_next_sequence_nr()); auto batch_size = offsets.size(0); auto num_input = inputs.size(0); @@ -408,7 +408,7 @@ at::Tensor AtenIpexTypeExt::embedding_bag_forward(const at::Tensor &weights, con if (weights.scalar_type() == at::kFloat) { return _embedding_bag_forward(weights, inputs, offsets); } else { - TORCH_INTERNAL_ASSERT(weights.scalar_type() == at::kBFloat16); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(weights.scalar_type() == at::kBFloat16); return _embedding_bag_forward(weights, inputs, offsets); } } @@ -418,7 +418,7 @@ at::Tensor AtenIpexTypeExt::embedding_bag_backward(const at::Tensor &grad_out, if (grad_out.scalar_type() == at::kFloat) { return _embedding_bag_backward(grad_out, weights, inputs, offsets); } else { - TORCH_INTERNAL_ASSERT(grad_out.scalar_type() == at::kBFloat16); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grad_out.scalar_type() == at::kBFloat16); return _embedding_bag_backward(grad_out, weights, inputs, offsets); } } diff --git a/torch_ipex/csrc/cpu/MlpOPs.cpp b/torch_ipex/csrc/cpu/MlpOPs.cpp index 017a6c5ec..4d3137955 100644 --- a/torch_ipex/csrc/cpu/MlpOPs.cpp +++ b/torch_ipex/csrc/cpu/MlpOPs.cpp @@ -93,7 +93,7 @@ std::vector AtenIpexTypeMLPExt::backward( const at::Tensor &grad_output, const at::Tensor &input, const at::Tensor &weight) { - TORCH_INTERNAL_ASSERT(libxsmm_handle_ != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(libxsmm_handle_ != nullptr); libxsmm_dnn_err_t global_status; auto nbn = input.size(0); auto nbc = input.size(1); @@ -173,12 +173,12 @@ void *AtenIpexTypeMLPExt::create_handle(int N, int C, int K, int bn, int bc, int } at::Tensor AtenIpexTypeMLPExt::set_relu_mask(void *libxsmm_handle_) { - TORCH_INTERNAL_ASSERT(libxsmm_handle_ != nullptr); + 
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(libxsmm_handle_ != nullptr); libxsmm_dnn_fullyconnected *handle = (libxsmm_dnn_fullyconnected*)libxsmm_handle_; libxsmm_dnn_err_t status; libxsmm_dnn_err_t global_status; libxsmm_dnn_tensor_datalayout* layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout(handle, LIBXSMM_DNN_RELU_MASK, &status); CHKERR_LIBXSMM_DNN( status ); - TORCH_INTERNAL_ASSERT(layout != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout != nullptr); std::vector dim_size; for (int i = layout->num_dims - 1; i >= 0; i--) { dim_size.push_back(layout->dim_size[i]); @@ -192,7 +192,7 @@ at::Tensor AtenIpexTypeMLPExt::set_relu_mask(void *libxsmm_handle_) { } void AtenIpexTypeMLPExt::release_handle(void* libxsmm_handle_) { - TORCH_INTERNAL_ASSERT(libxsmm_handle_ != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(libxsmm_handle_ != nullptr); libxsmm_dnn_err_t global_status; libxsmm_dnn_err_t status; libxsmm_dnn_fullyconnected* libxsmm_handle = (libxsmm_dnn_fullyconnected*)libxsmm_handle_; diff --git a/torch_ipex/csrc/cpu/ShadeDataContext.h b/torch_ipex/csrc/cpu/ShadeDataContext.h index ace9ce5dd..37216a83e 100644 --- a/torch_ipex/csrc/cpu/ShadeDataContext.h +++ b/torch_ipex/csrc/cpu/ShadeDataContext.h @@ -27,17 +27,17 @@ struct ShadeDataContext { ~ShadeDataContext() { if (this->data_type == SHADE_DATA_TYPE::DIL) { // DIL Tensor if (this->dil_tensor.is_public_format()) { - TORCH_INTERNAL_ASSERT(this->cpu_raw_data != nullptr); - TORCH_INTERNAL_ASSERT(this->dil_tensor.get_data_handle() == this->cpu_raw_data); - TORCH_INTERNAL_ASSERT(this->cpu_del_fun == &(c10::detail::deleteNothing)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(this->cpu_raw_data != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(this->dil_tensor.get_data_handle() == this->cpu_raw_data); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(this->cpu_del_fun == &(c10::detail::deleteNothing)); } else { // If dil tensor is block format, the cpu raw data means nothing here. 
- TORCH_INTERNAL_ASSERT(this->cpu_raw_data == nullptr); - TORCH_INTERNAL_ASSERT(this->cpu_del_fun == nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(this->cpu_raw_data == nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(this->cpu_del_fun == nullptr); } } else { // CPU Tensor here - TORCH_INTERNAL_ASSERT(this->cpu_del_fun != nullptr); - TORCH_INTERNAL_ASSERT(this->cpu_del_fun != &(c10::detail::deleteNothing)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(this->cpu_del_fun != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(this->cpu_del_fun != &(c10::detail::deleteNothing)); this->cpu_del_fun(this->cpu_raw_data); this->cpu_raw_data = nullptr; } @@ -49,10 +49,10 @@ struct ShadeDataContext { * @param raw_data Raw pointer of @class ShadeDataContext */ static void freeShadeDataContext(void *raw_data) { - TORCH_INTERNAL_ASSERT(raw_data != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(raw_data != nullptr); ShadeDataContext *shade_data_ctx = (ShadeDataContext*)raw_data; auto data_type = shade_data_ctx->data_type; - TORCH_INTERNAL_ASSERT((data_type == SHADE_DATA_TYPE::CPU_RAW) || (data_type == SHADE_DATA_TYPE::DIL)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY((data_type == SHADE_DATA_TYPE::CPU_RAW) || (data_type == SHADE_DATA_TYPE::DIL)); delete shade_data_ctx; } @@ -74,11 +74,11 @@ struct ShadeDataContext { * only contains DNNL buffer, it obiviouly is DNNL tensor */ static inline bool isDilTensor(const at::Tensor &tensor) { - TORCH_INTERNAL_ASSERT(tensor.has_storage()); - TORCH_INTERNAL_ASSERT(tensor.layout() == c10::Layout::Strided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.has_storage()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.layout() == c10::Layout::Strided); if (tensor.device().type() != c10::DeviceType::DPCPP) { - TORCH_INTERNAL_ASSERT(tensor.device().type() == c10::DeviceType::CPU); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.device().type() == c10::DeviceType::CPU); return false; } @@ -87,17 +87,17 @@ struct ShadeDataContext { void *storage_context = tensor.storage().data_ptr().get_context(); ShadeDataContext *shade_data_context = (ShadeDataContext*)storage_context; auto data_type = shade_data_context->data_type; - TORCH_INTERNAL_ASSERT((data_type == SHADE_DATA_TYPE::CPU_RAW) || (data_type == SHADE_DATA_TYPE::DIL)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY((data_type == SHADE_DATA_TYPE::CPU_RAW) || (data_type == SHADE_DATA_TYPE::DIL)); if (data_type == SHADE_DATA_TYPE::DIL) { auto raw_cpu_data = tensor.storage().data_ptr().get(); if (raw_cpu_data == nullptr) { // the dnnl tensor does not share data with raw tensor data. - TORCH_INTERNAL_ASSERT(! (shade_data_context->dil_tensor.is_empty())); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! (shade_data_context->dil_tensor.is_empty())); return true; } else { // The dnnl tensor shares some data with raw tensor. - TORCH_INTERNAL_ASSERT(shade_data_context->dil_tensor.is_public_format()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(shade_data_context->dil_tensor.is_public_format()); // For the case: // 1. There is a tensor named A @@ -111,7 +111,7 @@ struct ShadeDataContext { // All these tensors share same buffer of Tensor A with different storge offsets and elements. // So the context modification will impact all these tensors. if (check_tensor_own_whole_storage(tensor)) { - TORCH_INTERNAL_ASSERT(shade_data_context->dil_tensor.get_size() == tensor.storage().capacity()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(shade_data_context->dil_tensor.get_size() == tensor.storage().capacity()); return true; } } @@ -138,10 +138,10 @@ struct ShadeDataContext { * an empty DNNL buffer. 
The caller should check the return buffer is empty or not. */ static inline dil::tensor getDilTensor(const at::Tensor &tensor) { - TORCH_INTERNAL_ASSERT(tensor.has_storage()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.has_storage()); void *raw_context = tensor.storage().data_ptr().get_context(); - TORCH_INTERNAL_ASSERT(raw_context != nullptr); - TORCH_INTERNAL_ASSERT(isDilTensor(tensor)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(raw_context != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isDilTensor(tensor)); ShadeDataContext *shade_data_context = (ShadeDataContext*)raw_context; return shade_data_context->dil_tensor; } @@ -155,15 +155,15 @@ struct ShadeDataContext { * and return it to the caller. Otherwise, the function will return nullptr */ static inline void * getCpuRawData(const at::Tensor &tensor) { - TORCH_INTERNAL_ASSERT(tensor.has_storage()); - TORCH_INTERNAL_ASSERT(tensor.unsafeGetTensorImpl()->unique_version()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.has_storage()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.unsafeGetTensorImpl()->unique_version()); if (isCpuTensor(tensor)) { auto& data_ptr = tensor.storage().data_ptr(); ShadeDataContext *shade_data_context = (ShadeDataContext*)(data_ptr.get_context()); - TORCH_INTERNAL_ASSERT(shade_data_context != nullptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(shade_data_context != nullptr); return shade_data_context->cpu_raw_data; } else { - TORCH_INTERNAL_ASSERT(false); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return nullptr; } } diff --git a/torch_ipex/csrc/cpu/SparseAttr.cpp b/torch_ipex/csrc/cpu/SparseAttr.cpp index d7d92a2c6..8dab5ce5c 100644 --- a/torch_ipex/csrc/cpu/SparseAttr.cpp +++ b/torch_ipex/csrc/cpu/SparseAttr.cpp @@ -5,59 +5,59 @@ namespace torch_ipex { namespace cpu { int64_t AtenIpexCPUSparse::sparse_dim(const at::Tensor & self) { - TORCH_INTERNAL_ASSERT(self.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kSparse); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->sparse_dim(); } int64_t AtenIpexCPUSparse::dense_dim(const at::Tensor & self) { - TORCH_INTERNAL_ASSERT(self.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kSparse); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->dense_dim(); } int64_t AtenIpexCPUSparse::_dimI(const at::Tensor & self) { - TORCH_INTERNAL_ASSERT(self.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kSparse); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->sparse_dim(); } int64_t AtenIpexCPUSparse::_dimV(const at::Tensor & self) { - TORCH_INTERNAL_ASSERT(self.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kSparse); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->dense_dim(); } int64_t AtenIpexCPUSparse::_nnz(const at::Tensor & self) { - TORCH_INTERNAL_ASSERT(self.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kSparse); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->nnz(); } bool AtenIpexCPUSparse::is_coalesced(const at::Tensor & self) { - TORCH_INTERNAL_ASSERT(self.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kSparse); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->coalesced(); } at::Tensor & AtenIpexCPUSparse::_coalesced_(at::Tensor & self, bool coalesced) { - TORCH_INTERNAL_ASSERT(self.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kSparse); 
IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->set_coalesced(coalesced); return self; } at::Tensor AtenIpexCPUSparse::_indices(const at::Tensor & self) { - TORCH_INTERNAL_ASSERT(self.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kSparse); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->indices(); } at::Tensor AtenIpexCPUSparse::_values(const at::Tensor & self) { - TORCH_INTERNAL_ASSERT(self.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.layout() == c10::kSparse); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->values(); } at::Tensor AtenIpexCPUSparse::indices(const at::Tensor& self) { - TORCH_INTERNAL_ASSERT(self.is_coalesced(), + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.is_coalesced(), "Cannot get indices on an uncoalesced tensor, please call .coalesce() first"); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->indices().alias(); } at::Tensor AtenIpexCPUSparse::values(const at::Tensor& self) { - TORCH_INTERNAL_ASSERT(self.is_coalesced(), + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.is_coalesced(), "Cannot get values on an uncoalesced tensor, please call .coalesce() first"); return IPEXSparseTensorImpl::get_ipex_sparse_impl(self)->values().alias(); } diff --git a/torch_ipex/csrc/cpu/dbl/Common.cpp b/torch_ipex/csrc/cpu/dbl/Common.cpp index bf338402d..7432d3a9e 100644 --- a/torch_ipex/csrc/cpu/dbl/Common.cpp +++ b/torch_ipex/csrc/cpu/dbl/Common.cpp @@ -24,8 +24,8 @@ dil::tensor dil_tensor_from_dense(const at::Tensor& tensor) { } at::Tensor dil_tensor_to_dense(const at::Tensor& tensor) { - TORCH_INTERNAL_ASSERT(cpu::ShadeDataContext::isDilTensor(tensor)); - TORCH_INTERNAL_ASSERT(tensor.unsafeGetTensorImpl()->version_counter().current_version() == 1); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(cpu::ShadeDataContext::isDilTensor(tensor)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.unsafeGetTensorImpl()->version_counter().current_version() == 1); auto dil_tensor = cpu::ShadeDataContext::getDilTensor(tensor); at::Tensor cpu_tensor = at::empty( dil_tensor.get_dims(), @@ -76,10 +76,10 @@ at::Tensor gen_aten_tensor_by(dil::tensor dil_tensor) { } else { // Blockformat does not inlcude stride information auto tensor_sizes = dil_tensor.get_dims(); - TORCH_INTERNAL_ASSERT(tensor_sizes.size() != 1 || tensor_sizes[0] != 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor_sizes.size() != 1 || tensor_sizes[0] != 0); _tensor.unsafeGetTensorImpl()->set_sizes_contiguous(tensor_sizes); } - TORCH_INTERNAL_ASSERT(_tensor.layout() == c10::kStrided); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_tensor.layout() == c10::kStrided); return _tensor; } @@ -95,7 +95,7 @@ at::Tensor empty_dil_tensor(at::IntArrayRef sizes, const at::TensorOptions& opti void sync_shape_from_dil_to_aten(const at::Tensor& ipex_tensor, const dil::tensor &dil_tensor) { dil::dims sizes = dil_tensor.get_dims(); dil::dims strides = dil_tensor.get_strides(); - TORCH_INTERNAL_ASSERT(ipex_tensor.device().type() == at::DeviceType::DPCPP); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipex_tensor.device().type() == at::DeviceType::DPCPP); auto* _tensor_impl = (IPEXTensorImpl *)ipex_tensor.unsafeGetTensorImpl(); _tensor_impl->force_set_strided(sizes, strides); } diff --git a/torch_ipex/csrc/ipex_sparse_tensor_impl.cpp b/torch_ipex/csrc/ipex_sparse_tensor_impl.cpp index 6c2c8fdc0..6efd2443c 100644 --- a/torch_ipex/csrc/ipex_sparse_tensor_impl.cpp +++ b/torch_ipex/csrc/ipex_sparse_tensor_impl.cpp @@ -7,7 +7,7 @@ IPEXSparseTensorImpl::IPEXSparseTensorImpl(at::DispatchKeySet type_set, const ca } 
IPEXSparseTensorImpl * IPEXSparseTensorImpl::get_ipex_sparse_impl(const at::Tensor& ipex_tensor) { - TORCH_INTERNAL_ASSERT(ipex_tensor.layout() == c10::kSparse); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ipex_tensor.layout() == c10::kSparse); // TORCH_INTERNAL_ASSERT(ipex_tensor.device().type() == at::DeviceType::DPCPP); return static_cast(ipex_tensor.unsafeGetTensorImpl()); } diff --git a/torch_ipex/csrc/ipex_tensor_impl.cpp b/torch_ipex/csrc/ipex_tensor_impl.cpp index 851693956..934674f8c 100644 --- a/torch_ipex/csrc/ipex_tensor_impl.cpp +++ b/torch_ipex/csrc/ipex_tensor_impl.cpp @@ -70,7 +70,7 @@ void IPEXTensorImpl::reset_data_type(at::ScalarType dst_type) { void IPEXTensorImpl::copy_auto_grad(c10::TensorImpl *src_impl) { if (! src_impl->requires_grad()) { - TORCH_INTERNAL_ASSERT(! this->requires_grad()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! this->requires_grad()); return; } diff --git a/torch_ipex/csrc/jit/fusion_pass.cpp b/torch_ipex/csrc/jit/fusion_pass.cpp index 539aba0aa..62ca9c86c 100644 --- a/torch_ipex/csrc/jit/fusion_pass.cpp +++ b/torch_ipex/csrc/jit/fusion_pass.cpp @@ -262,14 +262,14 @@ class OpFuser { // Y-merge like case // if (safe && node->inputs().size() > 1) { - TORCH_INTERNAL_ASSERT(r); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(r); auto rule = *r.value(); auto& schema = matchSchemaForFusion(rule.second, v->node(), node); auto o_schema = node->schema(); auto pos = v->node()->inputs().size(); - TORCH_INTERNAL_ASSERT(schema.arguments().size() + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(schema.arguments().size() == pos + node->inputs().size() -1); for (int i = 0; i < node->inputs().size(); ++ i) {
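
Note on the assertion change throughout this patch: TORCH_INTERNAL_ASSERT_DEBUG_ONLY (declared in c10/util/Exception.h) only performs its check when NDEBUG is not defined, which is why the patch also adds -DNDEBUG to the release flags in cmake/CPU.cmake. The sketch below is a minimal standalone illustration of that release/debug split, not the actual c10 implementation; the MY_* macro names are hypothetical stand-ins.

// Minimal sketch of the release/debug split this patch relies on.
// Illustration only: the real TORCH_INTERNAL_ASSERT_DEBUG_ONLY lives in
// c10/util/Exception.h; the MY_* names here are hypothetical stand-ins.
#include <cstdio>
#include <cstdlib>

#define MY_INTERNAL_ASSERT(cond)                            \
  do {                                                      \
    if (!(cond)) {                                          \
      std::fprintf(stderr, "assert failed: %s\n", #cond);   \
      std::abort();                                         \
    }                                                       \
  } while (0)

#ifdef NDEBUG
// Release build (-O2 -DNDEBUG): the body is unreachable, so the check is
// compiled out and the condition is never evaluated at runtime.
#define MY_INTERNAL_ASSERT_DEBUG_ONLY(cond) while (false) MY_INTERNAL_ASSERT(cond)
#else
// Debug build (-O0 -g -D_DEBUG): behaves like the unconditional assert.
#define MY_INTERNAL_ASSERT_DEBUG_ONLY(cond) MY_INTERNAL_ASSERT(cond)
#endif

int main() {
  bool layout_is_strided = true;
  MY_INTERNAL_ASSERT_DEBUG_ONLY(layout_is_strided);  // free in release builds
  MY_INTERNAL_ASSERT(layout_is_strided);             // always checked
  return 0;
}

Because the generator scripts and the bridge/DevOPs/ExtendOPs sources now emit the _DEBUG_ONLY variant, these tensor metadata checks still fire in -D_DEBUG builds but add no overhead on the -O2 -DNDEBUG release path.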