intel
diff --git a/‎intel_pytorch_extension_py/ops/nms.py
Lines changed: 2 additions & 1 deletion b/‎intel_pytorch_extension_py/ops/nms.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎tests/cpu/data/nms_plabel.pt
4.68 MB b/‎tests/cpu/data/nms_plabel.pt
4.68 MB
diff --git a/‎tests/cpu/data/nms_ploc.pt
237 KB b/‎tests/cpu/data/nms_ploc.pt
237 KB
diff --git a/‎tests/cpu/test_nms.py
Lines changed: 46 additions & 6 deletions b/‎tests/cpu/test_nms.py
Lines changed: 46 additions & 6 deletions
diff --git a/‎torch_ipex/csrc/cpu/ExtendOPs.h
Lines changed: 12 additions & 3 deletions b/‎torch_ipex/csrc/cpu/ExtendOPs.h
Lines changed: 12 additions & 3 deletions
@@ -1,5 +1,6 @@
 import torch
 
-nms = torch.ops.torch_ipex.nms
+def nms(dets, scores, threshold, sorted=False):
+    return torch.ops.torch_ipex.nms(dets, scores, threshold, sorted)
 batch_score_nms = torch.ops.torch_ipex.batch_score_nms
 parallel_scale_back_batch = torch.ops.torch_ipex.parallel_scale_back_batch
@@ -4,7 +4,7 @@
 import intel_pytorch_extension as ipex
 from common_utils import TestCase
 import time, sys
-from intel_pytorch_extension import batch_score_nms, parallel_scale_back_batch
+from intel_pytorch_extension import batch_score_nms, parallel_scale_back_batch, nms
 import torch.nn.functional as F
 import os
 
@@ -118,17 +118,16 @@ def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200)
         max_ids = max_ids[-max_output:]
         return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
 
-    def test_nms_result(self):
+    def test_batch_nms_result(self):
         batch_size = 1
         number_boxes = 15130
         scale_xy = 0.1
         scale_wh = 0.2
         criteria = 0.50
         max_output = 200
-        predicted_loc = torch.randn((batch_size, number_boxes, 4)).contiguous().to(torch.float32)
-        predicted_score = torch.randn((batch_size, number_boxes, 81)).contiguous().to(torch.float32)
-        dboxes_xywh = torch.randn((1, number_boxes, 4)).contiguous().to(torch.float64)
-        dboxes_xywh = torch.load(os.path.dirname(__file__) + "/data/nms_dboxes_xywh.pt")
+        predicted_loc = torch.load(os.path.join(os.path.dirname(__file__), "data/nms_ploc.pt")) # sizes: [1, 15130, 4]
+        predicted_score = torch.load(os.path.join(os.path.dirname(__file__), "data/nms_plabel.pt")) # sizes: [1, 15130, 81]
+        dboxes_xywh = torch.load(os.path.join(os.path.dirname(__file__), "data/nms_dboxes_xywh.pt"))
         bboxes, probs = parallel_scale_back_batch(predicted_loc, predicted_score, dboxes_xywh, scale_xy, scale_wh)
         bboxes_clone = bboxes.clone()
         probs_clone = probs.clone()
@@ -147,5 +146,46 @@ def test_nms_result(self):
             self.assertEqual(label, label2)
             self.assertTrue(torch.allclose(prob, prob2, rtol=1e-4, atol=1e-4))
 
+    def test_nms_kernel_result(self):
+        batch_size = 1
+        class_number = 81
+        scale_xy = 0.1
+        scale_wh = 0.2
+        criteria = 0.50
+        max_output = 200
+        predicted_loc = torch.load(os.path.join(os.path.dirname(__file__), "data/nms_ploc.pt")) # sizes: [1, 15130, 4]
+        predicted_score = torch.load(os.path.join(os.path.dirname(__file__), "data/nms_plabel.pt")) # sizes: [1, 15130, 81]
+        dboxes_xywh = torch.load(os.path.join(os.path.dirname(__file__), "data/nms_dboxes_xywh.pt"))
+        bboxes, probs = parallel_scale_back_batch(predicted_loc, predicted_score, dboxes_xywh, scale_xy, scale_wh)
+
+        for bs in range(batch_size):
+            loc = bboxes[bs].squeeze(0)
+            for class_id in range(class_number):
+                if class_id == 0:
+                    # Skip the background
+                    continue
+                score = probs[bs, :, class_id]
+
+                score_sorted, indices = torch.sort(score, descending=True)
+                loc_sorted = torch.index_select(loc, 0, indices)
+
+                result = nms(loc_sorted.clone(), score_sorted.clone(), criteria, True)
+                result_ref = nms(loc.clone(), score.clone(), criteria, False)
+                result_ref2 = nms(loc_sorted.clone().to(dtype=torch.float64), score_sorted.clone().to(dtype=torch.float64), criteria, True)
+
+                bbox_keep, _ = torch.sort(torch.index_select(loc_sorted, 0, result).squeeze(0), 0)
+                bbox_keep_ref, _ = torch.sort(torch.index_select(loc, 0, result_ref).squeeze(0), 0)
+                bbox_keep_ref2, _ = torch.sort(torch.index_select(loc_sorted, 0, result_ref2).squeeze(0), 0)
+
+                score_keep, _ = torch.sort(torch.index_select(score_sorted, 0, result).squeeze(0), 0)
+                score_keep_ref, _ = torch.sort(torch.index_select(score, 0, result_ref).squeeze(0), 0)
+                score_keep_ref2, _ = torch.sort(torch.index_select(score_sorted, 0, result_ref2).squeeze(0), 0)
+
+                self.assertEqual(result.size(0), result_ref.size(0))
+                self.assertTrue(torch.allclose(bbox_keep, bbox_keep_ref, rtol=1e-4, atol=1e-4))
+                self.assertTrue(torch.allclose(score_keep, score_keep_ref, rtol=1e-4, atol=1e-4))
+                self.assertTrue(torch.allclose(bbox_keep, bbox_keep_ref2, rtol=1e-4, atol=1e-4))
+                self.assertTrue(torch.allclose(score_keep, score_keep_ref2, rtol=1e-4, atol=1e-4))
+
 if __name__ == '__main__':
     test = unittest.main()
@@ -26,12 +26,21 @@ class AtenIpexTypeExt {
                                       const int64_t height,
                                       const int64_t width,
                                       const int64_t sampling_ratio);
-    
+
+  /// \brief Perform non-maximum suppression.
+  ///
+  /// \param dets: predicted box locations in ltrb format for one batch element, size [number_boxes, 4], for example: [200, 4].
+  /// \param scores: predicted scores for one batch element and one class, size [number_boxes], for example: [200].
+  /// \param threshold: IoU threshold (scalar); bboxes whose IoU is larger than the threshold are suppressed.
+  /// \param sorted: true if the scores and dets are already sorted in descending order of score.
+  ///
+  /// \return a Tensor holding the indices of the dets to be kept.
   static at::Tensor nms(const at::Tensor& dets,
                         const at::Tensor& scores,
-                        const double threshold);
+                        const double threshold,
+                        const bool sorted);
 
-  /// \brief Perform non-maximum suppression.
+  /// \brief Perform batch non-maximum suppression.
   ///
   /// C++ version of Encoder::decode_single.
   /// Refer to https://github.com/mlcommons/inference/blob/v0.7/others/cloud/single_stage_detector/pytorch/utils.py.