From 97d7e11082270d5813fc5c700746bd753c410b09 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 6 Jun 2021 16:56:17 -0400 Subject: [PATCH 1/3] precommit fixup --- pandas/_libs/groupby.pyx | 89 ++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index b72b927b3c2a8..40933c8f30319 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1345,13 +1345,8 @@ cdef group_cummin_max(groupby_t[:, ::1] out, This method modifies the `out` parameter, rather than returning an object. """ cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval + Py_ssize_t N, K groupby_t[:, ::1] accum - intp_t lab - bint val_is_nan, use_mask - - use_mask = mask is not None N, K = (values).shape accum = np.empty((ngroups, K), dtype=values.dtype) @@ -1362,36 +1357,78 @@ cdef group_cummin_max(groupby_t[:, ::1] out, else: accum[:] = -np.inf if compute_max else np.inf + if mask is not None: + masked_cummin_max(out, values, mask, labels, accum, N, K, compute_max) + else: + cummin_max(out, values, labels, accum, N, K, is_datetimelike, compute_max) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef cummin_max(groupby_t[:, ::1] out, + ndarray[groupby_t, ndim=2] values, + const intp_t[:] labels, + groupby_t[:, ::1] accum, + Py_ssize_t N, + Py_ssize_t K, + bint is_datetimelike, + bint compute_max): + """ + Compute the cumulative minimum/maximum of columns of `values`, in row groups + `labels`. + """ + cdef: + Py_ssize_t i, j + groupby_t val, mval + intp_t lab + with nogil: for i in range(N): lab = labels[i] - if lab < 0: continue for j in range(K): - val_is_nan = False - - if use_mask: - if mask[i, j]: - - # `out` does not need to be set since it - # will be masked anyway - val_is_nan = True + val = values[i, j] + if not _treat_as_na(val, is_datetimelike): + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + else: + out[i, j] = val - # If using the mask, we can avoid grabbing the - # value unless necessary - val = values[i, j] - # Otherwise, `out` must be set accordingly if the - # value is missing - else: - val = values[i, j] - if _treat_as_na(val, is_datetimelike): - val_is_nan = True - out[i, j] = val +@cython.boundscheck(False) +@cython.wraparound(False) +cdef masked_cummin_max(groupby_t[:, ::1] out, + ndarray[groupby_t, ndim=2] values, + uint8_t[:, ::1] mask, + const intp_t[:] labels, + groupby_t[:, ::1] accum, + Py_ssize_t N, + Py_ssize_t K, + bint compute_max): + """ + Compute the cumulative minimum/maximum of columns of `values`, in row groups + `labels` with a masked algorithm. + """ + cdef: + Py_ssize_t i, j + groupby_t val, mval + intp_t lab - if not val_is_nan: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + for j in range(K): + if not mask[i, j]: + val = values[i, j] mval = accum[lab, j] if compute_max: if val > mval: From 885f8ae956e918c6cfe360bd2cb66aebbd5879cc Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 6 Jun 2021 20:18:47 -0400 Subject: [PATCH 2/3] wip --- pandas/_libs/groupby.pyx | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 40933c8f30319..ee57ff5c2a205 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1358,9 +1358,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, accum[:] = -np.inf if compute_max else np.inf if mask is not None: - masked_cummin_max(out, values, mask, labels, accum, N, K, compute_max) + masked_cummin_max(out, values, mask, labels, accum, compute_max) else: - cummin_max(out, values, labels, accum, N, K, is_datetimelike, compute_max) + cummin_max(out, values, labels, accum, is_datetimelike, compute_max) @cython.boundscheck(False) @@ -1369,8 +1369,6 @@ cdef cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, groupby_t[:, ::1] accum, - Py_ssize_t N, - Py_ssize_t K, bint is_datetimelike, bint compute_max): """ @@ -1378,10 +1376,11 @@ cdef cummin_max(groupby_t[:, ::1] out, `labels`. """ cdef: - Py_ssize_t i, j + Py_ssize_t i, j, N, K groupby_t val, mval intp_t lab + N, K = (values).shape with nogil: for i in range(N): lab = labels[i] @@ -1409,18 +1408,17 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, uint8_t[:, ::1] mask, const intp_t[:] labels, groupby_t[:, ::1] accum, - Py_ssize_t N, - Py_ssize_t K, bint compute_max): """ Compute the cumulative minimum/maximum of columns of `values`, in row groups `labels` with a masked algorithm. """ cdef: - Py_ssize_t i, j + Py_ssize_t i, j, N, K groupby_t val, mval intp_t lab + N, K = (values).shape with nogil: for i in range(N): lab = labels[i] From 51f8f9c8ccd0def2c0a4f93dc398a89ab94e986e Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 7 Jun 2021 11:00:19 -0400 Subject: [PATCH 3/3] Remove unused --- pandas/_libs/groupby.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ee57ff5c2a205..0e0598c3264e8 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1345,11 +1345,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, This method modifies the `out` parameter, rather than returning an object. """ cdef: - Py_ssize_t N, K groupby_t[:, ::1] accum - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=values.dtype) + accum = np.empty((ngroups, (values).shape[1]), dtype=values.dtype) if groupby_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max elif groupby_t is uint64_t: