-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
REF: de-duplicate groupby_helper code #28934
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
af8f8c3
43772af
bea1cc5
e638c89
fd3a673
a61e7f5
1803844
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,18 @@ ctypedef fused rank_t: | |
object | ||
|
||
|
||
cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: | ||
if rank_t is object: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Somewhat counter-intuitive that this works - out of curiosity what was the complaint? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AFAICT the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldn’t need the gil if these are c level object (eg ints or floats) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, we just need to exclude the object case from getting to the |
||
# Should never be used, but we need to avoid the `val != val` below | ||
# or else cython will raise about gil acquisition. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is pretty odd, why dont you return an enum here (true, false, raise) and handle in code appropriately. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the vicissitudes of cython fused types |
||
raise NotImplementedError | ||
|
||
elif rank_t is int64_t: | ||
return is_datetimelike and val == NPY_NAT | ||
else: | ||
return val != val | ||
|
||
|
||
@cython.wraparound(False) | ||
@cython.boundscheck(False) | ||
def group_last(rank_t[:, :] out, | ||
|
@@ -61,24 +73,16 @@ def group_last(rank_t[:, :] out, | |
for j in range(K): | ||
val = values[i, j] | ||
|
||
# not nan | ||
if rank_t is int64_t: | ||
# need a special notna check | ||
if val != NPY_NAT: | ||
nobs[lab, j] += 1 | ||
resx[lab, j] = val | ||
else: | ||
if val == val: | ||
nobs[lab, j] += 1 | ||
resx[lab, j] = val | ||
if val == val: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this totally equivalent to what's in place? Looks like we are losing the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is in the |
||
# NB: use _treat_as_na here once | ||
# conditional-nogil is available. | ||
nobs[lab, j] += 1 | ||
resx[lab, j] = val | ||
|
||
for i in range(ncounts): | ||
for j in range(K): | ||
if nobs[i, j] == 0: | ||
if rank_t is int64_t: | ||
out[i, j] = NPY_NAT | ||
else: | ||
out[i, j] = NAN | ||
out[i, j] = NAN | ||
else: | ||
out[i, j] = resx[i, j] | ||
else: | ||
|
@@ -92,16 +96,10 @@ def group_last(rank_t[:, :] out, | |
for j in range(K): | ||
val = values[i, j] | ||
|
||
# not nan | ||
if rank_t is int64_t: | ||
# need a special notna check | ||
if val != NPY_NAT: | ||
nobs[lab, j] += 1 | ||
resx[lab, j] = val | ||
else: | ||
if val == val: | ||
nobs[lab, j] += 1 | ||
resx[lab, j] = val | ||
if not _treat_as_na(val, True): | ||
# TODO: Sure we always want is_datetimelike=True? | ||
nobs[lab, j] += 1 | ||
resx[lab, j] = val | ||
|
||
for i in range(ncounts): | ||
for j in range(K): | ||
|
@@ -113,6 +111,7 @@ def group_last(rank_t[:, :] out, | |
break | ||
else: | ||
out[i, j] = NAN | ||
|
||
else: | ||
out[i, j] = resx[i, j] | ||
|
||
|
@@ -121,7 +120,6 @@ def group_last(rank_t[:, :] out, | |
# block. | ||
raise RuntimeError("empty group with uint64_t") | ||
|
||
|
||
group_last_float64 = group_last["float64_t"] | ||
group_last_float32 = group_last["float32_t"] | ||
group_last_int64 = group_last["int64_t"] | ||
|
@@ -169,8 +167,9 @@ def group_nth(rank_t[:, :] out, | |
for j in range(K): | ||
val = values[i, j] | ||
|
||
# not nan | ||
if val == val: | ||
# NB: use _treat_as_na here once | ||
# conditional-nogil is available. | ||
nobs[lab, j] += 1 | ||
if nobs[lab, j] == rank: | ||
resx[lab, j] = val | ||
|
@@ -193,18 +192,11 @@ def group_nth(rank_t[:, :] out, | |
for j in range(K): | ||
val = values[i, j] | ||
|
||
# not nan | ||
if rank_t is int64_t: | ||
# need a special notna check | ||
if val != NPY_NAT: | ||
nobs[lab, j] += 1 | ||
if nobs[lab, j] == rank: | ||
resx[lab, j] = val | ||
else: | ||
if val == val: | ||
nobs[lab, j] += 1 | ||
if nobs[lab, j] == rank: | ||
resx[lab, j] = val | ||
if not _treat_as_na(val, True): | ||
# TODO: Sure we always want is_datetimelike=True? | ||
nobs[lab, j] += 1 | ||
if nobs[lab, j] == rank: | ||
resx[lab, j] = val | ||
|
||
for i in range(ncounts): | ||
for j in range(K): | ||
|
@@ -487,17 +479,11 @@ def group_max(groupby_t[:, :] out, | |
for j in range(K): | ||
val = values[i, j] | ||
|
||
# not nan | ||
if groupby_t is int64_t: | ||
if val != nan_val: | ||
nobs[lab, j] += 1 | ||
if val > maxx[lab, j]: | ||
maxx[lab, j] = val | ||
else: | ||
if val == val: | ||
nobs[lab, j] += 1 | ||
if val > maxx[lab, j]: | ||
maxx[lab, j] = val | ||
if not _treat_as_na(val, True): | ||
# TODO: Sure we always want is_datetimelike=True? | ||
nobs[lab, j] += 1 | ||
if val > maxx[lab, j]: | ||
maxx[lab, j] = val | ||
|
||
for i in range(ncounts): | ||
for j in range(K): | ||
|
@@ -563,17 +549,11 @@ def group_min(groupby_t[:, :] out, | |
for j in range(K): | ||
val = values[i, j] | ||
|
||
# not nan | ||
if groupby_t is int64_t: | ||
if val != nan_val: | ||
nobs[lab, j] += 1 | ||
if val < minx[lab, j]: | ||
minx[lab, j] = val | ||
else: | ||
if val == val: | ||
nobs[lab, j] += 1 | ||
if val < minx[lab, j]: | ||
minx[lab, j] = val | ||
if not _treat_as_na(val, True): | ||
# TODO: Sure we always want is_datetimelike=True? | ||
nobs[lab, j] += 1 | ||
if val < minx[lab, j]: | ||
minx[lab, j] = val | ||
|
||
for i in range(ncounts): | ||
for j in range(K): | ||
|
@@ -643,21 +623,13 @@ def group_cummin(groupby_t[:, :] out, | |
for j in range(K): | ||
val = values[i, j] | ||
|
||
# val = nan | ||
if groupby_t is int64_t: | ||
if is_datetimelike and val == NPY_NAT: | ||
out[i, j] = NPY_NAT | ||
else: | ||
mval = accum[lab, j] | ||
if val < mval: | ||
accum[lab, j] = mval = val | ||
out[i, j] = mval | ||
if _treat_as_na(val, is_datetimelike): | ||
out[i, j] = val | ||
else: | ||
if val == val: | ||
mval = accum[lab, j] | ||
if val < mval: | ||
accum[lab, j] = mval = val | ||
out[i, j] = mval | ||
mval = accum[lab, j] | ||
if val < mval: | ||
accum[lab, j] = mval = val | ||
out[i, j] = mval | ||
|
||
|
||
@cython.boundscheck(False) | ||
|
@@ -712,17 +684,10 @@ def group_cummax(groupby_t[:, :] out, | |
for j in range(K): | ||
val = values[i, j] | ||
|
||
if groupby_t is int64_t: | ||
if is_datetimelike and val == NPY_NAT: | ||
out[i, j] = NPY_NAT | ||
else: | ||
mval = accum[lab, j] | ||
if val > mval: | ||
accum[lab, j] = mval = val | ||
out[i, j] = mval | ||
if _treat_as_na(val, is_datetimelike): | ||
out[i, j] = val | ||
else: | ||
if val == val: | ||
mval = accum[lab, j] | ||
if val > mval: | ||
accum[lab, j] = mval = val | ||
out[i, j] = mval | ||
mval = accum[lab, j] | ||
if val > mval: | ||
accum[lab, j] = mval = val | ||
out[i, j] = mval |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this only applicable in groupby or should it go in util?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Only here. As I mention in comments below, I'm not wild about the fact that is_datetimelike is effectively hard-coded to True in several places. Will try to see if that causes problems in follow-up(s)