Working average tiebreak with NaN handling

WillAyd · WillAyd · commit b4c3dfd5e111 · 2018-02-05T12:33:58.000-08:00
diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
@@ -11,3 +11,11 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
     a[0] = b[0]
     b[0] = t
     return 0
+
+cdef:
+    int TIEBREAK_AVERAGE = 0
+    int TIEBREAK_MIN = 1
+    int TIEBREAK_MAX = 2
+    int TIEBREAK_FIRST = 3
+    int TIEBREAK_FIRST_DESCENDING = 4
+    int TIEBREAK_DENSE = 5
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -13,11 +13,13 @@ from numpy cimport (ndarray,
                     int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
                     uint32_t, uint64_t, float32_t, float64_t)
 
+from libc.math cimport isnan
 from libc.stdlib cimport malloc, free
 
 from util cimport numeric, get_nat
-from algos cimport swap
-from algos import take_2d_axis1_float64_float64, groupsort_indexer
+from algos cimport (swap, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX,
+                    TIEBREAK_FIRST, TIEBREAK_DENSE)
+from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers
 
 cdef int64_t iNaT = get_nat()
 
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
@@ -444,7 +444,6 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 else:
                     out[i, j] = resx[i, j]
 
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_rank_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
@@ -455,27 +454,35 @@ def group_rank_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     Only transforms on axis=0
     """
     cdef:
+        int tiebreak
         Py_ssize_t i, j, N, K
-        int64_t lab, idx, counter=1
+        int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0
         ndarray[int64_t] _as
 
+    tiebreak = tiebreakers[kwargs['ties_method']]
     N, K = (<object> values).shape
 
     _as = np.lexsort((values[:, 0], labels))
 
     with nogil:
         for i in range(N):
-            idx = _as[i]
-            lab = labels[idx]
-            if i > 0 and lab == labels[_as[i-1]]:
-                counter += 1
-            else:
-                counter = 1
-            if lab < 0:
-                continue
-
-            for j in range(K):
-                out[idx, j] = counter
+            dups += 1
+            sum_ranks += i - grp_start + 1
+
+            if tiebreak == TIEBREAK_AVERAGE:
+                for j in range(i - dups + 1, i + 1):
+                    out[_as[j], 0] = sum_ranks / dups
+
+            if (i == N - 1 or (
+                    (values[_as[i], 0] != values[_as[i+1], 0]) and not
+                    (isnan(values[_as[i], 0]) and
+                     isnan(values[_as[i+1], 0])
+                    ))):
+                dups = sum_ranks = 0
+                val_start = i
+
+            if i == 0 or labels[_as[i]] != labels[_as[i-1]]:
+                grp_start = i
 
 {{endfor}}
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1770,10 +1770,10 @@ def cumcount(self, ascending=True):
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
-    def rank(self, ties_method='average', ascending=True, na_option='keep',
+    def rank(self, method='average', ascending=True, na_option='keep',
              pct=False, axis=0):
         """Rank within each group"""
-        return self._cython_transform('rank', ties_method=ties_method,
+        return self._cython_transform('rank', ties_method=method,
                                       ascending=ascending, na_option=na_option,
                                       pct=pct, axis=axis)