enable the fallback to cpu for adaptive_avg_pool2d and max_pool2d (#84)

chunyuan-w · web-flow · commit 79953a5d0218 · 2020-06-22T09:56:04.000+08:00
1. Move "bn folding" and "prepack conv weight" to the hooked JIT script function
2. Add a check on the fused node in the graph
diff --git a/tests/cpu/test_lazy_reorder.py b/tests/cpu/test_lazy_reorder.py
@@ -641,6 +641,48 @@ def test_adaptive_avg_pool2d_backward(self):
         y_cpu.backward()
         y_dpcpp.backward()
         self.assertEqual(x_cpu.grad, x_dpcpp.grad)
+    
+    def test_adaptive_avg_pool2d_not_divisible(self):  # forward-only check of AdaptiveAvgPool2d(6) on a 224x224 input
+        ipex.enable_auto_dnnl()
+        rand_seed = int(get_rand_seed())
+        print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))  # log this test's name and the seed in use
+        torch.manual_seed(rand_seed)
+        N = torch.randint(3, 10, (1,)).item()
+        C = torch.randint(3, 10, (1,)).item()
+        x_cpu = torch.randn(N, C, 224, 224, dtype=torch.float32) * 100
+        x_dpcpp = x_cpu.to(device=device)
+        # 224 is not a multiple of the output size 6; this is the case meant to exercise the fallback to cpu
+        adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d(6)
+
+        y_cpu = adaptive_avg_pool2d(x_cpu)
+        y_dpcpp = adaptive_avg_pool2d(x_dpcpp)
+
+        self.assertEqual(
+            y_cpu,
+            y_dpcpp)
+
+        self.assertEqual(device, y_dpcpp.device.type)  # the output must still report the `device` type
+
+    def test_adaptive_avg_pool2d_backward_not_divisible(self):  # gradient check of AdaptiveAvgPool2d(6) on a 224x224 input
+        ipex.enable_auto_dnnl()
+        rand_seed = int(get_rand_seed())
+        print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))  # log this test's name and the seed in use
+        torch.manual_seed(rand_seed)
+        x = torch.randn(10, 3, 224, 224, dtype=torch.float32) * 100
+
+        x_cpu = x.clone().requires_grad_()  # independent leaf copies so each side accumulates its own grad
+        x_dpcpp = x.clone().to(device=device).requires_grad_()
+        # 224 is not a multiple of the output size 6; this is the case meant to exercise the fallback to cpu
+        adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d(6)
+
+        y_cpu = adaptive_avg_pool2d(x_cpu).sum()  # reduce to a scalar so backward() needs no explicit gradient
+        y_dpcpp = adaptive_avg_pool2d(x_dpcpp).sum()
+        y_cpu.backward()
+        y_dpcpp.backward()
+        self.assertEqual(x_cpu.grad, x_dpcpp.grad)
+
+        self.assertEqual(device, x_dpcpp.grad.device.type)  # the gradient must still report the `device` type
+        self.assertEqual(device, y_dpcpp.device.type)  # and so must the forward result
 
     def test_max_pool2d(self):
         ipex.core.enable_auto_dnnl()
@@ -663,6 +705,33 @@ def test_max_pool2d(self):
                         ceil_mode=ceil_mode)
 
                     self.assertEqual(max_pool2d(x_cpu), max_pool2d(x_dpcpp))
+    
+    def test_max_pool2d_double(self):  # forward-only check of MaxPool2d on torch.double inputs
+        ipex.enable_auto_dnnl()
+        rand_seed = int(get_rand_seed())
+        print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))  # log this test's name and the seed in use
+        torch.manual_seed(rand_seed)
+        N = torch.randint(3, 10, (1,)).item()
+        C = torch.randint(3, 10, (1,)).item()
+
+        for stride in [1, 2, 3]:
+            for H, W in [(64, 64), (35, 39), (16, 19), [7, 8]]:  # the last entry is a list but unpacks like the tuples
+                # double-precision input: the case meant to exercise the fallback to cpu
+                x_cpu = torch.randn(N, C, H, W, dtype=torch.double) * 10
+                x_dpcpp = x_cpu.to(device=device)
+
+                for ceil_mode in [False, True]:
+                    max_pool2d = torch.nn.MaxPool2d(
+                        kernel_size=3 if not ceil_mode else 7,
+                        stride=stride,
+                        padding=1,
+                        ceil_mode=ceil_mode)
+                    
+                    y_cpu = max_pool2d(x_cpu)
+                    y_dpcpp = max_pool2d(x_dpcpp)
+                    self.assertEqual(y_cpu, y_dpcpp)
+
+                    self.assertEqual(device, y_dpcpp.device.type)  # the output must still report the `device` type
 
     def test_max_pool3d(self):
         ipex.core.enable_auto_dnnl()
@@ -707,6 +776,32 @@ def test_max_pool2d_backward(self):
             y1.backward()
             y2.backward()
             self.assertEqual(x1.grad, x2.grad)
+    
+    def test_max_pool2d_backward_double(self):  # gradient check of MaxPool2d on a torch.double input
+        ipex.enable_auto_dnnl()
+        rand_seed = int(get_rand_seed())
+        print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))  # log this test's name and the seed in use
+        torch.manual_seed(rand_seed)
+        # double-precision input: the case meant to exercise the fallback to cpu
+        x = torch.randn(10, 3, 64, 64, dtype=torch.double) * 10
+        for ceil_mode in [True]:  # only ceil_mode=True is exercised here
+            max_pool2d = torch.nn.MaxPool2d(
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                ceil_mode=ceil_mode)
+
+            x1 = x.clone().requires_grad_()  # independent leaf copies so each side accumulates its own grad
+            x2 = x.clone().to(device=device).requires_grad_()
+
+            y1 = max_pool2d(x1).sum()  # reduce to a scalar so backward() needs no explicit gradient
+            y2 = max_pool2d(x2).sum()
+            y1.backward()
+            y2.backward()
+            self.assertEqual(x1.grad, x2.grad)
+
+            self.assertEqual(device, x2.grad.device.type)  # the gradient must still report the `device` type
+            self.assertEqual(device, y2.device.type)  # and so must the forward result
 
     def test_max_pool3d_backward(self):
         ipex.core.enable_auto_dnnl()
diff --git a/torch_ipex/csrc/cpu/CustomOPs.h b/torch_ipex/csrc/cpu/CustomOPs.h
@@ -6,6 +6,7 @@
 #include <ATen/Tensor.h>
 #include <torch/script.h>
 #include <c10/util/Optional.h>
+#include "torch_ipex/csrc/aten_ipex_bridge.h"
 #include "torch_ipex/csrc/utils.h"
 #include "DevOPs.h"
 
@@ -68,17 +69,29 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> {
         ctx->saved_data["dilation"] = dilation;
         ctx->saved_data["ceil_mode"] = ceil_mode;
 
-        if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
-          at::Tensor output = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling(input.is_contiguous() ? input : input.contiguous(), kernel_size, stride,
-              padding, dilation, ceil_mode);
-          ctx->save_for_backward({input, output});
-          return output;
+        try {
+          if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+            at::Tensor output = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling(input.is_contiguous() ? input : input.contiguous(), kernel_size, stride,
+                padding, dilation, ceil_mode);
+            ctx->save_for_backward({input, output});
+            return output;
+          }
+        } catch (std::exception& e) {
+#if defined(_DEBUG)
+          TORCH_WARN(e.what());
+#endif
+        }
+        at::Tensor output, indices;
+        if (input.device().type() == c10::DeviceType::DPCPP) {
+          auto&& _ipex_input = torch_ipex::bridge::shallowFallbackToCPUTensor(input);
+          auto&& _ipex_result = at::max_pool2d_with_indices(_ipex_input, kernel_size, stride, padding, dilation, ceil_mode);
+          static_cast<void>(_ipex_result);
+          std::tie(output, indices) = std::tuple<at::Tensor,at::Tensor>(torch_ipex::bridge::shallowUpgradeToDPCPPTensor(std::get<0>(_ipex_result)), torch_ipex::bridge::shallowUpgradeToDPCPPTensor(std::get<1>(_ipex_result)));
         } else {
-          at::Tensor output, indices;
           std::tie(output, indices) = at::max_pool2d_with_indices(input, kernel_size, stride, padding, dilation, ceil_mode);
-          ctx->save_for_backward({input, indices});
-          return output;
         }
+        ctx->save_for_backward({input, indices});
+        return output;
       }
 
     static torch::autograd::tensor_list backward(
@@ -97,9 +110,26 @@ class NewMaxPoolingOp : public torch::autograd::Function<NewMaxPoolingOp> {
       std::vector<int64_t> dilation = ctx->saved_data["dilation"].toIntVector();
       bool ceil_mode = ctx->saved_data["ceil_mode"].toBool();
 
-      if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
-        grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling_backward(
-            grad_output.is_contiguous() ? grad_output : grad_output.contiguous(), indices.is_contiguous() ? indices : indices.contiguous(), input.is_contiguous() ? input : input.contiguous(), kernel_size, stride, padding, dilation, ceil_mode);
+      
+      try {
+        if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+          grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_max_pooling_backward(
+              grad_output.is_contiguous() ? grad_output : grad_output.contiguous(), indices.is_contiguous() ? indices : indices.contiguous(), input.is_contiguous() ? input : input.contiguous(), kernel_size, stride, padding, dilation, ceil_mode);
+          return {grad_input, at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor()};
+        }
+      } catch (std::exception& e) {
+#if defined(_DEBUG)
+        TORCH_WARN(e.what());
+#endif
+      }
+      if (input.device().type() == c10::DeviceType::DPCPP) {
+        auto&& _ipex_grad_output = torch_ipex::bridge::shallowFallbackToCPUTensor(grad_output);
+        auto&& _ipex_input = torch_ipex::bridge::shallowFallbackToCPUTensor(input);
+        auto&& _ipex_indices = torch_ipex::bridge::shallowFallbackToCPUTensor(indices);
+        auto&& _ipex_grad_input = at::max_pool2d_with_indices_backward(_ipex_grad_output, _ipex_input, kernel_size,
+            stride, padding, dilation, ceil_mode, _ipex_indices);
+        static_cast<void>(_ipex_grad_input);
+        grad_input = torch_ipex::bridge::shallowUpgradeToDPCPPTensor(_ipex_grad_input);
       } else {
         grad_input = at::max_pool2d_with_indices_backward(grad_output, input, kernel_size,
             stride, padding, dilation, ceil_mode, indices);
@@ -116,13 +146,23 @@ class NewApaptiveAvgPoolingOp : public torch::autograd::Function<NewApaptiveAvgP
         at::IntArrayRef output_size) {
         ctx->save_for_backward({input});
 
-        at::Tensor output;
-        if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
-          output = torch_ipex::cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d(input.is_contiguous() ? input : input.contiguous(), output_size);
+        try{
+          if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+            return torch_ipex::cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d(input.is_contiguous() ? input : input.contiguous(), output_size);
+          } 
+        } catch (std::exception& e) {
+#if defined(_DEBUG)
+          TORCH_WARN(e.what());
+#endif
+        }
+        if (input.device().type() == c10::DeviceType::DPCPP) {
+          auto&& _ipex_input = torch_ipex::bridge::shallowFallbackToCPUTensor(input);
+          auto&& _ipex_result = at::_adaptive_avg_pool2d(_ipex_input, output_size);
+          static_cast<void>(_ipex_result); // Avoid warnings in case not used
+          return torch_ipex::bridge::shallowUpgradeToDPCPPTensor(_ipex_result);
         } else {
-          output = at::_adaptive_avg_pool2d(input, output_size);
+          return at::_adaptive_avg_pool2d(input, output_size);
         }
-        return output;
       }
 
     static torch::autograd::tensor_list backward(
@@ -134,8 +174,22 @@ class NewApaptiveAvgPoolingOp : public torch::autograd::Function<NewApaptiveAvgP
       at::Tensor grad_output = grad_outputs[0];
       at::Tensor grad_input;
 
-      if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
-        grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d_backward(grad_output.is_contiguous() ? grad_output : grad_output.contiguous(), input.is_contiguous() ? input : input.contiguous());
+      try {
+        if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
+          grad_input = torch_ipex::cpu::AtenIpexCPUDev::dil_adaptive_avg_pool2d_backward(grad_output.is_contiguous() ? grad_output : grad_output.contiguous(), input.is_contiguous() ? input : input.contiguous());
+          return {grad_input, at::Tensor()};
+        }
+      } catch (std::exception& e) {
+#if defined(_DEBUG)
+        TORCH_WARN(e.what());
+#endif
+      }
+      if (input.device().type() == c10::DeviceType::DPCPP) {
+        auto&& _ipex_grad_output = torch_ipex::bridge::shallowFallbackToCPUTensor(grad_output);
+        auto&& _ipex_input = torch_ipex::bridge::shallowFallbackToCPUTensor(input);
+        auto&& _ipex_result = at::_adaptive_avg_pool2d_backward(_ipex_grad_output, _ipex_input);
+        static_cast<void>(_ipex_result); // Avoid warnings in case not used
+        grad_input = torch_ipex::bridge::shallowUpgradeToDPCPPTensor(_ipex_result);
       } else {
         grad_input = at::_adaptive_avg_pool2d_backward(grad_output, input);
       }