From af8f8c338c9faaefd65f43d75c6054757a38d5a0 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 11 Oct 2019 10:48:01 -0700
Subject: [PATCH 1/3] REF: implement _treat_as_na to de-duplicate na checks

---
 pandas/_libs/groupby.pyx           |   3 +-
 pandas/_libs/groupby_helper.pxi.in | 133 +++++++++++------------------
 2 files changed, 53 insertions(+), 83 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 3069bbbf34bb7..c9994812462b1 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -372,7 +372,8 @@ def group_any_all(uint8_t[:] out,
                   const uint8_t[:] mask,
                   object val_test,
                   bint skipna):
-    """Aggregated boolean values to show truthfulness of group elements
+    """
+    Aggregated boolean values to show truthfulness of group elements.
 
     Parameters
     ----------
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 6b434b6470581..510803a5896f9 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -19,6 +19,18 @@ ctypedef fused rank_t:
     object
 
 
+cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
+    if rank_t is object:
+        # Should never be used, but we need to avoid the `val != val` below
+        # or else cython will raise about gil acquisition.
+        raise NotImplementedError
+
+    elif rank_t is int64_t:
+        return is_datetimelike and val == NPY_NAT
+    else:
+        return val != val
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_last(rank_t[:, :] out,
@@ -59,16 +71,11 @@ def group_last(rank_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
-                if rank_t is int64_t:
-                    # need a special notna check
-                    if val != NPY_NAT:
-                        nobs[lab, j] += 1
-                        resx[lab, j] = val
-                else:
-                    if val == val:
-                        nobs[lab, j] += 1
-                        resx[lab, j] = val
+                if val == val:
+                    # NB: use _treat_as_na here once
+                    # conditional-nogil is available.
+                    nobs[lab, j] += 1
+                    resx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
@@ -90,16 +97,10 @@ def group_last(rank_t[:, :] out,
                 for j in range(K):
                     val = values[i, j]
 
-                    # not nan
-                    if rank_t is int64_t:
-                        # need a special notna check
-                        if val != NPY_NAT:
-                            nobs[lab, j] += 1
-                            resx[lab, j] = val
-                    else:
-                        if val == val:
-                            nobs[lab, j] += 1
-                            resx[lab, j] = val
+                    if not _treat_as_na(val, True):
+                        # TODO: Sure we always want is_datetimelike=True?
+                        nobs[lab, j] += 1
+                        resx[lab, j] = val
 
             for i in range(ncounts):
                 for j in range(K):
@@ -111,6 +112,7 @@ def group_last(rank_t[:, :] out,
                     else:
                         out[i, j] = resx[i, j]
 
+
 group_last_float64 = group_last["float64_t"]
 group_last_float32 = group_last["float32_t"]
 group_last_int64 = group_last["int64_t"]
@@ -157,8 +159,9 @@ def group_nth(rank_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
                 if val == val:
+                    # NB: use _treat_as_na here once
+                    # conditional-nogil is available.
                     nobs[lab, j] += 1
                     if nobs[lab, j] == rank:
                         resx[lab, j] = val
@@ -181,18 +184,11 @@ def group_nth(rank_t[:, :] out,
                 for j in range(K):
                     val = values[i, j]
 
-                    # not nan
-                    if rank_t is int64_t:
-                        # need a special notna check
-                        if val != NPY_NAT:
-                            nobs[lab, j] += 1
-                            if nobs[lab, j] == rank:
-                                resx[lab, j] = val
-                    else:
-                        if val == val:
-                            nobs[lab, j] += 1
-                            if nobs[lab, j] == rank:
-                                resx[lab, j] = val
+                    if not _treat_as_na(val, True):
+                        # TODO: Sure we always want is_datetimelike=True?
+                        nobs[lab, j] += 1
+                        if nobs[lab, j] == rank:
+                            resx[lab, j] = val
 
             for i in range(ncounts):
                 for j in range(K):
@@ -455,17 +451,11 @@ def group_max(groupby_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
-                if groupby_t is int64_t:
-                    if val != nan_val:
-                        nobs[lab, j] += 1
-                        if val > maxx[lab, j]:
-                            maxx[lab, j] = val
-                else:
-                    if val == val and val != nan_val:
-                        nobs[lab, j] += 1
-                        if val > maxx[lab, j]:
-                            maxx[lab, j] = val
+                if not _treat_as_na(val, True):
+                    # TODO: Sure we always want is_datetimelike=True?
+                    nobs[lab, j] += 1
+                    if val > maxx[lab, j]:
+                        maxx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
@@ -517,17 +507,11 @@ def group_min(groupby_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
-                if groupby_t is int64_t:
-                    if val != nan_val:
-                        nobs[lab, j] += 1
-                        if val < minx[lab, j]:
-                            minx[lab, j] = val
-                else:
-                    if val == val and val != nan_val:
-                        nobs[lab, j] += 1
-                        if val < minx[lab, j]:
-                            minx[lab, j] = val
+                if not _treat_as_na(val, True):
+                    # TODO: Sure we always want is_datetimelike=True?
+                    nobs[lab, j] += 1
+                    if val < minx[lab, j]:
+                        minx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
@@ -587,21 +571,13 @@ def group_cummin(groupby_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # val = nan
-                if groupby_t is int64_t:
-                    if is_datetimelike and val == NPY_NAT:
-                        out[i, j] = NPY_NAT
-                    else:
-                        mval = accum[lab, j]
-                        if val < mval:
-                            accum[lab, j] = mval = val
-                        out[i, j] = mval
+                if _treat_as_na(val, is_datetimelike):
+                    out[i, j] = val
                 else:
-                    if val == val:
-                        mval = accum[lab, j]
-                        if val < mval:
-                            accum[lab, j] = mval = val
-                        out[i, j] = mval
+                    mval = accum[lab, j]
+                    if val < mval:
+                        accum[lab, j] = mval = val
+                    out[i, j] = mval
 
 
 @cython.boundscheck(False)
@@ -654,17 +630,10 @@ def group_cummax(groupby_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                if groupby_t is int64_t:
-                    if is_datetimelike and val == NPY_NAT:
-                        out[i, j] = NPY_NAT
-                    else:
-                        mval = accum[lab, j]
-                        if val > mval:
-                            accum[lab, j] = mval = val
-                        out[i, j] = mval
+                if _treat_as_na(val, is_datetimelike):
+                    out[i, j] = val
                 else:
-                    if val == val:
-                        mval = accum[lab, j]
-                        if val > mval:
-                            accum[lab, j] = mval = val
-                        out[i, j] = mval
+                    mval = accum[lab, j]
+                    if val > mval:
+                        accum[lab, j] = mval = val
+                    out[i, j] = mval

From 43772affbf3eee744569c97922fc8705e7010f99 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 11 Oct 2019 11:30:44 -0700
Subject: [PATCH 2/3] use nan_val pattern

---
 pandas/_libs/groupby_helper.pxi.in | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 510803a5896f9..86d3560d5b9a6 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -43,7 +43,7 @@ def group_last(rank_t[:, :] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val
+        rank_t val, nan_val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -58,6 +58,11 @@ def group_last(rank_t[:, :] out,
     else:
         resx = np.empty_like(out)
 
+    if rank_t is int64_t:
+        nan_val = NPY_NAT
+    else:
+        nan_val = NAN
+
     N, K = (<object>values).shape
 
     if rank_t is object:
@@ -80,10 +85,7 @@ def group_last(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    if rank_t is int64_t:
-                        out[i, j] = NPY_NAT
-                    else:
-                        out[i, j] = NAN
+                    out[i, j] = nan_val
                 else:
                     out[i, j] = resx[i, j]
     else:
@@ -105,10 +107,7 @@ def group_last(rank_t[:, :] out,
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] == 0:
-                        if rank_t is int64_t:
-                            out[i, j] = NPY_NAT
-                        else:
-                            out[i, j] = NAN
+                        out[i, j] = nan_val
                     else:
                         out[i, j] = resx[i, j]
 
@@ -131,7 +130,7 @@ def group_nth(rank_t[:, :] out,
     """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val
+        rank_t val, nan_val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -146,6 +145,11 @@ def group_nth(rank_t[:, :] out,
     else:
         resx = np.empty_like(out)
 
+    if rank_t is int64_t:
+        nan_val = NPY_NAT
+    else:
+        nan_val = NAN
+
     N, K = (<object>values).shape
 
     if rank_t is object:
@@ -169,7 +173,7 @@ def group_nth(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = NAN
+                    out[i, j] = nan_val
                 else:
                     out[i, j] = resx[i, j]
 
@@ -193,10 +197,7 @@ def group_nth(rank_t[:, :] out,
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] == 0:
-                        if rank_t is int64_t:
-                            out[i, j] = NPY_NAT
-                        else:
-                            out[i, j] = NAN
+                        out[i, j] = nan_val
                     else:
                         out[i, j] = resx[i, j]
 

From a61e7f5c9854e7351996ffb07ce1952fa8603f8a Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 16 Oct 2019 09:23:19 -0700
Subject: [PATCH 3/3] post-merge fixup

---
 pandas/_libs/groupby_helper.pxi.in | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index bee6cbc4fc825..c837c6c5c6519 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -44,7 +44,7 @@ def group_last(rank_t[:, :] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val, nan_val
+        rank_t val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
@@ -60,11 +60,6 @@ def group_last(rank_t[:, :] out,
     else:
         resx = np.empty_like(out)
 
-    if rank_t is int64_t:
-        nan_val = NPY_NAT
-    else:
-        nan_val = NAN
-
     N, K = (<object>values).shape
 
     if rank_t is object:
@@ -87,7 +82,7 @@ def group_last(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = nan_val
+                    out[i, j] = NAN
                 else:
                     out[i, j] = resx[i, j]
     else:
@@ -143,7 +138,7 @@ def group_nth(rank_t[:, :] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val, nan_val
+        rank_t val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
@@ -159,11 +154,6 @@ def group_nth(rank_t[:, :] out,
     else:
         resx = np.empty_like(out)
 
-    if rank_t is int64_t:
-        nan_val = NPY_NAT
-    else:
-        nan_val = NAN
-
     N, K = (<object>values).shape
 
     if rank_t is object:
@@ -187,7 +177,7 @@ def group_nth(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = nan_val
+                    out[i, j] = NAN
                 else:
                     out[i, j] = resx[i, j]
 
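
Note on the pattern (illustrative sketch, not part of the patch series): the point of
_treat_as_na is that each grouped reduction can share one "is this value NA for this
dtype?" guard instead of repeating the float NaN check (val != val) and the
datetime-like NaT check (val == NPY_NAT) in every loop. The pure-Python/NumPy sketch
below shows that idea under stated assumptions; the names treat_as_na and
group_last_1d and the sentinel constant are made up for illustration and are not
pandas API, and the real code above is Cython operating on 2-D fused-type memoryviews.

import numpy as np

# Sentinel pandas uses for NaT in datetime-like int64 data (iNaT).
NPY_NAT = np.iinfo(np.int64).min


def treat_as_na(val, is_datetimelike):
    # Rough Python analogue of the Cython helper added in PATCH 1/3.
    if isinstance(val, (int, np.integer)):
        # int64 values are only NA when they hold the NaT sentinel of a
        # datetime-like column; plain integers have no NA representation.
        return is_datetimelike and val == NPY_NAT
    # Floats: NaN is the only value that is not equal to itself.
    return val != val


def group_last_1d(values, labels, ngroups, is_datetimelike=False):
    # Simplified single-column analogue of group_last: keep the last
    # non-NA value seen for each group label; negative labels are skipped.
    result = np.full(ngroups, np.nan, dtype=object)
    for val, lab in zip(values, labels):
        if lab < 0:
            continue
        if not treat_as_na(val, is_datetimelike):
            result[lab] = val
    return result

For example, group_last_1d([1.0, np.nan, 3.0], [0, 0, 0], ngroups=1) returns 3.0 for
group 0, mirroring how group_last skips NA entries while recording the latest valid one.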