CLN: Rename "add" to "sum" in groupby #47892

Merged · 2 commits · Jul 29, 2022

2 changes: 1 addition & 1 deletion pandas/_libs/groupby.pyi
@@ -50,7 +50,7 @@ def group_any_all(
val_test: Literal["any", "all"],
skipna: bool,
) -> None: ...
def group_add(
def group_sum(
out: np.ndarray, # complexfloating_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[complexfloating_t, ndim=2]
30 changes: 15 additions & 15 deletions pandas/_libs/groupby.pyx
@@ -124,7 +124,7 @@ def group_median_float64(
ndarray[intp_t] indexer
float64_t* ptr

assert min_count == -1, "'min_count' only used in add and prod"
assert min_count == -1, "'min_count' only used in sum and prod"

ngroups = len(counts)
N, K = (<object>values).shape
@@ -502,7 +502,7 @@ def group_any_all(


# ----------------------------------------------------------------------
# group_add, group_prod, group_var, group_mean, group_ohlc
# group_sum, group_prod, group_var, group_mean, group_ohlc
# ----------------------------------------------------------------------

ctypedef fused mean_t:
@@ -511,17 +511,17 @@ ctypedef fused mean_t:
complex64_t
complex128_t

ctypedef fused add_t:
ctypedef fused sum_t:
mean_t
object


@cython.wraparound(False)
@cython.boundscheck(False)
def group_add(
add_t[:, ::1] out,
def group_sum(
sum_t[:, ::1] out,
int64_t[::1] counts,
ndarray[add_t, ndim=2] values,
ndarray[sum_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=0,
bint is_datetimelike=False,
@@ -531,8 +531,8 @@ def group_add(
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
add_t val, t, y
add_t[:, ::1] sumx, compensation
sum_t val, t, y
sum_t[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)

@@ -546,7 +546,7 @@

N, K = (<object>values).shape

if add_t is object:
if sum_t is object:
# NB: this does not use 'compensation' like the non-object track does.
for i in range(N):
lab = labels[i]
@@ -588,10 +588,10 @@ def group_add(

# not nan
# With dt64/td64 values, values have been cast to float64
# instead if int64 for group_add, but the logic
# instead if int64 for group_sum, but the logic
# is otherwise the same as in _treat_as_na
if val == val and not (
add_t is float64_t
sum_t is float64_t
and is_datetimelike
and val == <float64_t>NPY_NAT
):
@@ -677,7 +677,7 @@ def group_var(
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)

assert min_count == -1, "'min_count' only used in add and prod"
assert min_count == -1, "'min_count' only used in sum and prod"

if len_values != len_labels:
raise ValueError("len(index) != len(labels)")
@@ -745,7 +745,7 @@ def group_mean(
Array containing unique label for each group, with its
ordering matching up to the corresponding record in `values`.
min_count : Py_ssize_t
Only used in add and prod. Always -1.
Only used in sum and prod. Always -1.
is_datetimelike : bool
True if `values` contains datetime-like entries.
mask : ndarray[bool, ndim=2], optional
@@ -766,7 +766,7 @@ def group_mean(
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)

assert min_count == -1, "'min_count' only used in add and prod"
assert min_count == -1, "'min_count' only used in sum and prod"

if len_values != len_labels:
raise ValueError("len(index) != len(labels)")
@@ -821,7 +821,7 @@ def group_ohlc(
Py_ssize_t i, j, N, K, lab
floating val

assert min_count == -1, "'min_count' only used in add and prod"
assert min_count == -1, "'min_count' only used in sum and prod"

if len(labels) == 0:
return
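Note: the non-object path of `group_sum` above accumulates with Kahan (compensated) summation through the `sumx` and `compensation` buffers. A minimal pure-Python sketch of that accumulation pattern, with hypothetical names and none of the dtype, mask, or datetimelike handling of the real kernel:

```python
import numpy as np

def kahan_group_sum(values, labels, ngroups):
    # Per-group running sums and rounding-error terms, mirroring the
    # `sumx` / `compensation` buffers in the Cython kernel (sketch only).
    sumx = np.zeros(ngroups, dtype=np.float64)
    compensation = np.zeros(ngroups, dtype=np.float64)
    nobs = np.zeros(ngroups, dtype=np.int64)

    for val, lab in zip(values, labels):
        if lab < 0 or val != val:  # skip unlabelled rows and NaN
            continue
        nobs[lab] += 1
        y = val - compensation[lab]            # remove error carried so far
        t = sumx[lab] + y                      # accumulate
        compensation[lab] = t - sumx[lab] - y  # new rounding error
        sumx[lab] = t
    return sumx, nobs

# Group 0 sums 1.0 + 2.0, group 1 sums 3.0
sums, counts = kahan_group_sum([1.0, 2.0, 3.0], [0, 0, 1], ngroups=2)
```
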
8 changes: 2 additions & 6 deletions pandas/core/groupby/groupby.py
@@ -1338,7 +1338,6 @@ def _resolve_numeric_only(

if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
# GH#47500
how = "sum" if how == "add" else how
warnings.warn(
f"{type(self).__name__}.{how} called with "
f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will "
@@ -1738,9 +1737,8 @@ def _cython_agg_general(
kwd_name = "numeric_only"
if how in ["any", "all"]:
kwd_name = "bool_only"
kernel = "sum" if how == "add" else how
raise NotImplementedError(
f"{type(self).__name__}.{kernel} does not implement {kwd_name}."
f"{type(self).__name__}.{how} does not implement {kwd_name}."
)
elif not is_ser:
data = data.get_numeric_data(copy=False)
@@ -2417,7 +2415,7 @@ def sum(
result = self._agg_general(
numeric_only=numeric_only,
min_count=min_count,
alias="add",
alias="sum",
npfunc=np.sum,
)

@@ -4341,8 +4339,6 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde


def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None:
if how == "add":
how = "sum"
if numeric_only is not lib.no_default and not numeric_only:
# numeric_only was specified and falsey but still dropped nuisance columns
warnings.warn(
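With the kernel alias now matching the public method name, the `how == "add"` remapping shims removed above are no longer needed and messages can use `how` directly. The user-facing call is unchanged; a quick illustration with made-up data:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

# Same call before and after this PR; only the internal alias changed to "sum".
result = df.groupby("key")["val"].sum(min_count=1)
# key
# a    3
# b    3
# Name: val, dtype: int64
```
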
14 changes: 7 additions & 7 deletions pandas/core/groupby/ops.py
@@ -121,7 +121,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:

_CYTHON_FUNCTIONS = {
"aggregate": {
"add": "group_add",
"sum": "group_sum",
"prod": "group_prod",
"min": "group_min",
"max": "group_max",
@@ -213,7 +213,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
values = ensure_float64(values)

elif values.dtype.kind in ["i", "u"]:
if how in ["add", "var", "prod", "mean", "ohlc"] or (
if how in ["sum", "var", "prod", "mean", "ohlc"] or (
self.kind == "transform" and self.has_dropped_na
):
# result may still include NaN, so we have to cast
@@ -241,7 +241,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
if isinstance(dtype, CategoricalDtype):
# NotImplementedError for methods that can fall back to a
# non-cython implementation.
if how in ["add", "prod", "cumsum", "cumprod"]:
if how in ["sum", "prod", "cumsum", "cumprod"]:
raise TypeError(f"{dtype} type does not support {how} operations")
elif how not in ["rank"]:
# only "rank" is implemented in cython
@@ -258,7 +258,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
# TODO: same for period_dtype? no for these methods with Period
# we raise NotImplemented if this is an invalid operation
# entirely, e.g. adding datetimes
if how in ["add", "prod", "cumsum", "cumprod"]:
if how in ["sum", "prod", "cumsum", "cumprod"]:
raise TypeError(f"datetime64 type does not support {how} operations")
elif is_timedelta64_dtype(dtype):
if how in ["prod", "cumprod"]:
@@ -311,7 +311,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
"""
how = self.how

if how in ["add", "cumsum", "sum", "prod"]:
if how in ["sum", "cumsum", "sum", "prod"]:
if dtype == np.dtype(bool):
return np.dtype(np.int64)
elif how in ["mean", "median", "var"]:
@@ -567,7 +567,7 @@ def _call_cython_op(
result_mask=result_mask,
is_datetimelike=is_datetimelike,
)
elif self.how in ["add"]:
elif self.how in ["sum"]:
# We support datetimelike
func(
out=result,
@@ -625,7 +625,7 @@ def _call_cython_op(
# e.g. if we are int64 and need to restore to datetime64/timedelta64
# "rank" is the only member of cast_blocklist we get here
# Casting only needed for float16, bool, datetimelike,
# and self.how in ["add", "prod", "ohlc", "cumprod"]
# and self.how in ["sum", "prod", "ohlc", "cumprod"]
res_dtype = self._get_result_dtype(orig_values.dtype)
op_result = maybe_downcast_to_dtype(result, res_dtype)
else:
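After the rename, `how="sum"` maps straight to `group_sum` in `_CYTHON_FUNCTIONS` with no intermediate "add" alias. A simplified sketch of that resolution, using a standalone dict and helper for illustration rather than the real `WrappedCythonOp` machinery:

```python
from pandas._libs import groupby as libgroupby

# Trimmed-down version of the mapping in ops.py after this PR.
CYTHON_FUNCTIONS = {
    "aggregate": {"sum": "group_sum", "prod": "group_prod"},
}

def get_cython_function(kind: str, how: str):
    # "sum" resolves directly to libgroupby.group_sum; no "add" remapping step.
    fname = CYTHON_FUNCTIONS[kind][how]
    return getattr(libgroupby, fname)

func = get_cython_function("aggregate", "sum")  # -> libgroupby.group_sum
```
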
4 changes: 2 additions & 2 deletions pandas/tests/groupby/aggregate/test_cython.py
@@ -166,7 +166,7 @@ def test_cython_fail_agg():
("mean", np.mean),
("median", np.median),
("var", np.var),
("add", np.sum),
("sum", np.sum),
("prod", np.prod),
("min", np.min),
("max", np.max),
@@ -214,7 +214,7 @@ def test_cython_agg_empty_buckets_nanops(observed):
grps = range(0, 25, 5)
# add / sum
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
"add", alt=None, numeric_only=True
"sum", alt=None, numeric_only=True
)
intervals = pd.interval_range(0, 20, freq=5, inclusive="right")
expected = DataFrame(
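The parametrization above pairs each Cython kernel name with its NumPy counterpart, so the renamed entry keeps `("sum", np.sum)` aligned. A minimal standalone check in the same spirit, on a made-up frame rather than the suite's fixtures:

```python
import numpy as np
import pandas as pd
import pandas._testing as tm

df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1.0, 2.0, 3.0, 4.0]})

# The string alias and the NumPy reduction should agree group by group.
result = df.groupby("a").agg("sum")
expected = df.groupby("a").agg(np.sum)
tm.assert_frame_equal(result, expected)
```
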
6 changes: 3 additions & 3 deletions pandas/tests/resample/test_datetime_index.py
@@ -61,15 +61,15 @@ def test_custom_grouper(index):

# check all cython functions work
g.ohlc() # doesn't use _cython_agg_general
funcs = ["add", "mean", "prod", "min", "max", "var"]
funcs = ["sum", "mean", "prod", "min", "max", "var"]
for f in funcs:
g._cython_agg_general(f, alt=None, numeric_only=True)

b = Grouper(freq=Minute(5), closed="right", label="right")
g = s.groupby(b)
# check all cython functions work
g.ohlc() # doesn't use _cython_agg_general
funcs = ["add", "mean", "prod", "min", "max", "var"]
funcs = ["sum", "mean", "prod", "min", "max", "var"]
for f in funcs:
g._cython_agg_general(f, alt=None, numeric_only=True)

@@ -414,7 +414,7 @@ def test_resample_upsampling_picked_but_not_correct():
tm.assert_series_equal(result2, expected)


@pytest.mark.parametrize("f", ["add", "mean", "prod", "min", "max", "var"])
@pytest.mark.parametrize("f", ["sum", "mean", "prod", "min", "max", "var"])
def test_resample_frame_basic_cy_funcs(f):
df = tm.makeTimeDataFrame()

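Resampling dispatches through the same aggregation table, so `"sum"` in the `funcs` lists above now exercises `group_sum`. A small usage sketch with a made-up series:

```python
import numpy as np
import pandas as pd

idx = pd.date_range("2022-01-01", periods=12, freq="min")
s = pd.Series(np.arange(12.0), index=idx)

# Each 5-minute bin is reduced by the "sum" kernel (group_sum after this PR).
result = s.resample("5min").sum()
```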