From af8f8c338c9faaefd65f43d75c6054757a38d5a0 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 11 Oct 2019 10:48:01 -0700
Subject: [PATCH 1/3] REF: implement _treat_as_na to de-duplicate na checks

---
 pandas/_libs/groupby.pyx           |   3 +-
 pandas/_libs/groupby_helper.pxi.in | 133 +++++++++++------------------
 2 files changed, 53 insertions(+), 83 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 3069bbbf34bb7..c9994812462b1 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -372,7 +372,8 @@ def group_any_all(uint8_t[:] out,
                   const uint8_t[:] mask,
                   object val_test,
                   bint skipna):
-    """Aggregated boolean values to show truthfulness of group elements
+    """
+    Aggregated boolean values to show truthfulness of group elements.
 
     Parameters
     ----------
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 6b434b6470581..510803a5896f9 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -19,6 +19,18 @@ ctypedef fused rank_t:
     object
 
 
+cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
+    if rank_t is object:
+        # Should never be used, but we need to avoid the `val != val` below
+        # or else cython will raise about gil acquisition.
+        raise NotImplementedError
+
+    elif rank_t is int64_t:
+        return is_datetimelike and val == NPY_NAT
+    else:
+        return val != val
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_last(rank_t[:, :] out,
@@ -59,16 +71,11 @@ def group_last(rank_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
-                if rank_t is int64_t:
-                    # need a special notna check
-                    if val != NPY_NAT:
-                        nobs[lab, j] += 1
-                        resx[lab, j] = val
-                else:
-                    if val == val:
-                        nobs[lab, j] += 1
-                        resx[lab, j] = val
+                if val == val:
+                    # NB: use _treat_as_na here once
+                    # conditional-nogil is available.
+                    nobs[lab, j] += 1
+                    resx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
@@ -90,16 +97,10 @@ def group_last(rank_t[:, :] out,
                 for j in range(K):
                     val = values[i, j]
 
-                    # not nan
-                    if rank_t is int64_t:
-                        # need a special notna check
-                        if val != NPY_NAT:
-                            nobs[lab, j] += 1
-                            resx[lab, j] = val
-                    else:
-                        if val == val:
-                            nobs[lab, j] += 1
-                            resx[lab, j] = val
+                    if not _treat_as_na(val, True):
+                        # TODO: Sure we always want is_datetimelike=True?
+                        nobs[lab, j] += 1
+                        resx[lab, j] = val
 
             for i in range(ncounts):
                 for j in range(K):
@@ -111,6 +112,7 @@ def group_last(rank_t[:, :] out,
                     else:
                         out[i, j] = resx[i, j]
 
+
 group_last_float64 = group_last["float64_t"]
 group_last_float32 = group_last["float32_t"]
 group_last_int64 = group_last["int64_t"]
@@ -157,8 +159,9 @@ def group_nth(rank_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
                 if val == val:
+                    # NB: use _treat_as_na here once
+                    # conditional-nogil is available.
                     nobs[lab, j] += 1
                     if nobs[lab, j] == rank:
                         resx[lab, j] = val
@@ -181,18 +184,11 @@ def group_nth(rank_t[:, :] out,
                 for j in range(K):
                     val = values[i, j]
 
-                    # not nan
-                    if rank_t is int64_t:
-                        # need a special notna check
-                        if val != NPY_NAT:
-                            nobs[lab, j] += 1
-                            if nobs[lab, j] == rank:
-                                resx[lab, j] = val
-                    else:
-                        if val == val:
-                            nobs[lab, j] += 1
-                            if nobs[lab, j] == rank:
-                                resx[lab, j] = val
+                    if not _treat_as_na(val, True):
+                        # TODO: Sure we always want is_datetimelike=True?
+                        nobs[lab, j] += 1
+                        if nobs[lab, j] == rank:
+                            resx[lab, j] = val
 
             for i in range(ncounts):
                 for j in range(K):
@@ -455,17 +451,11 @@ def group_max(groupby_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
-                if groupby_t is int64_t:
-                    if val != nan_val:
-                        nobs[lab, j] += 1
-                        if val > maxx[lab, j]:
-                            maxx[lab, j] = val
-                else:
-                    if val == val and val != nan_val:
-                        nobs[lab, j] += 1
-                        if val > maxx[lab, j]:
-                            maxx[lab, j] = val
+                if not _treat_as_na(val, True):
+                    # TODO: Sure we always want is_datetimelike=True?
+                    nobs[lab, j] += 1
+                    if val > maxx[lab, j]:
+                        maxx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
@@ -517,17 +507,11 @@ def group_min(groupby_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
-                if groupby_t is int64_t:
-                    if val != nan_val:
-                        nobs[lab, j] += 1
-                        if val < minx[lab, j]:
-                            minx[lab, j] = val
-                else:
-                    if val == val and val != nan_val:
-                        nobs[lab, j] += 1
-                        if val < minx[lab, j]:
-                            minx[lab, j] = val
+                if not _treat_as_na(val, True):
+                    # TODO: Sure we always want is_datetimelike=True?
+                    nobs[lab, j] += 1
+                    if val < minx[lab, j]:
+                        minx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
@@ -587,21 +571,13 @@ def group_cummin(groupby_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # val = nan
-                if groupby_t is int64_t:
-                    if is_datetimelike and val == NPY_NAT:
-                        out[i, j] = NPY_NAT
-                    else:
-                        mval = accum[lab, j]
-                        if val < mval:
-                            accum[lab, j] = mval = val
-                        out[i, j] = mval
+                if _treat_as_na(val, is_datetimelike):
+                    out[i, j] = val
                 else:
-                    if val == val:
-                        mval = accum[lab, j]
-                        if val < mval:
-                            accum[lab, j] = mval = val
-                        out[i, j] = mval
+                    mval = accum[lab, j]
+                    if val < mval:
+                        accum[lab, j] = mval = val
+                    out[i, j] = mval
 
 
 @cython.boundscheck(False)
@@ -654,17 +630,10 @@ def group_cummax(groupby_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                if groupby_t is int64_t:
-                    if is_datetimelike and val == NPY_NAT:
-                        out[i, j] = NPY_NAT
-                    else:
-                        mval = accum[lab, j]
-                        if val > mval:
-                            accum[lab, j] = mval = val
-                        out[i, j] = mval
+                if _treat_as_na(val, is_datetimelike):
+                    out[i, j] = val
                 else:
-                    if val == val:
-                        mval = accum[lab, j]
-                        if val > mval:
-                            accum[lab, j] = mval = val
-                        out[i, j] = mval
+                    mval = accum[lab, j]
+                    if val > mval:
+                        accum[lab, j] = mval = val
+                    out[i, j] = mval

From 43772affbf3eee744569c97922fc8705e7010f99 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 11 Oct 2019 11:30:44 -0700
Subject: [PATCH 2/3] use nan_val pattern

---
 pandas/_libs/groupby_helper.pxi.in | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 510803a5896f9..86d3560d5b9a6 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -43,7 +43,7 @@ def group_last(rank_t[:, :] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val
+        rank_t val, nan_val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -58,6 +58,11 @@ def group_last(rank_t[:, :] out,
     else:
         resx = np.empty_like(out)
 
+    if rank_t is int64_t:
+        nan_val = NPY_NAT
+    else:
+        nan_val = NAN
+
     N, K = (<object>values).shape
 
     if rank_t is object:
@@ -80,10 +85,7 @@ def group_last(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    if rank_t is int64_t:
-                        out[i, j] = NPY_NAT
-                    else:
-                        out[i, j] = NAN
+                    out[i, j] = nan_val
                 else:
                     out[i, j] = resx[i, j]
     else:
@@ -105,10 +107,7 @@ def group_last(rank_t[:, :] out,
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] == 0:
-                        if rank_t is int64_t:
-                            out[i, j] = NPY_NAT
-                        else:
-                            out[i, j] = NAN
+                        out[i, j] = nan_val
                     else:
                         out[i, j] = resx[i, j]
 
@@ -131,7 +130,7 @@ def group_nth(rank_t[:, :] out,
     """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val
+        rank_t val, nan_val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -146,6 +145,11 @@ def group_nth(rank_t[:, :] out,
     else:
         resx = np.empty_like(out)
 
+    if rank_t is int64_t:
+        nan_val = NPY_NAT
+    else:
+        nan_val = NAN
+
     N, K = (<object>values).shape
 
     if rank_t is object:
@@ -169,7 +173,7 @@ def group_nth(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = NAN
+                    out[i, j] = nan_val
                 else:
                     out[i, j] = resx[i, j]
 
@@ -193,10 +197,7 @@ def group_nth(rank_t[:, :] out,
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] == 0:
-                        if rank_t is int64_t:
-                            out[i, j] = NPY_NAT
-                        else:
-                            out[i, j] = NAN
+                        out[i, j] = nan_val
                     else:
                         out[i, j] = resx[i, j]
 

From a61e7f5c9854e7351996ffb07ce1952fa8603f8a Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 16 Oct 2019 09:23:19 -0700
Subject: [PATCH 3/3] post-merge fixup

---
 pandas/_libs/groupby_helper.pxi.in | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index bee6cbc4fc825..c837c6c5c6519 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -44,7 +44,7 @@ def group_last(rank_t[:, :] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val, nan_val
+        rank_t val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
@@ -60,11 +60,6 @@ def group_last(rank_t[:, :] out,
     else:
         resx = np.empty_like(out)
 
-    if rank_t is int64_t:
-        nan_val = NPY_NAT
-    else:
-        nan_val = NAN
-
     N, K = (<object>values).shape
 
     if rank_t is object:
@@ -87,7 +82,7 @@ def group_last(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = nan_val
+                    out[i, j] = NAN
                 else:
                     out[i, j] = resx[i, j]
     else:
@@ -143,7 +138,7 @@ def group_nth(rank_t[:, :] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val, nan_val
+        rank_t val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
@@ -159,11 +154,6 @@ def group_nth(rank_t[:, :] out,
     else:
         resx = np.empty_like(out)
 
-    if rank_t is int64_t:
-        nan_val = NPY_NAT
-    else:
-        nan_val = NAN
-
     N, K = (<object>values).shape
 
     if rank_t is object:
@@ -187,7 +177,7 @@ def group_nth(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = nan_val
+                    out[i, j] = NAN
                 else:
                     out[i, j] = resx[i, j]
 
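
Note on the pattern (illustrative sketch, not part of the patch series): the point of
_treat_as_na is that each grouped reduction can share one "is this value NA for this
dtype?" guard instead of repeating the float NaN check (val != val) and the
datetime-like NaT check (val == NPY_NAT) in every loop. The pure-Python/NumPy sketch
below shows that idea under stated assumptions; the names treat_as_na and
group_last_1d and the sentinel constant are made up for illustration and are not
pandas API, and the real code above is Cython operating on 2-D fused-type memoryviews.

import numpy as np

# Sentinel pandas uses for NaT in datetime-like int64 data (iNaT).
NPY_NAT = np.iinfo(np.int64).min


def treat_as_na(val, is_datetimelike):
    # Rough Python analogue of the Cython helper added in PATCH 1/3.
    if isinstance(val, (int, np.integer)):
        # int64 values are only NA when they hold the NaT sentinel of a
        # datetime-like column; plain integers have no NA representation.
        return is_datetimelike and val == NPY_NAT
    # Floats: NaN is the only value that is not equal to itself.
    return val != val


def group_last_1d(values, labels, ngroups, is_datetimelike=False):
    # Simplified single-column analogue of group_last: keep the last
    # non-NA value seen for each group label; negative labels are skipped.
    result = np.full(ngroups, np.nan, dtype=object)
    for val, lab in zip(values, labels):
        if lab < 0:
            continue
        if not treat_as_na(val, is_datetimelike):
            result[lab] = val
    return result

For example, group_last_1d([1.0, np.nan, 3.0], [0, 0, 0], ngroups=1) returns 3.0 for
group 0, mirroring how group_last skips NA entries while recording the latest valid one.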