Skip to content

DEPR: Change numeric_only to False in various groupby ops #49892

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ Removal of prior version deprecations/changes
- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
- Changed default of ``numeric_only`` to ``False`` in :meth:`.DataFrameGroupBy.sum` and :meth:`.DataFrameGroupBy.mean` (:issue:`46072`)
- Changed default of ``numeric_only`` to ``False`` in various :class:`.DataFrameGroupBy` methods (:issue:`46072`)
Copy link
Member Author

@rhshadrach rhshadrach Nov 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are still some ops for which the ``numeric_only=False`` default has yet to be enforced, mostly those that use ``_op_via_apply``. Once they are, I will be changing this to

The default of ``numeric_only`` is now ``False`` across all :class:`.DataFrameGroupBy` methods

once all ops are done.

- Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`)
-

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2219,7 +2219,7 @@ def skew(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
skipna: bool = True,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
**kwargs,
) -> DataFrame:
result = self._op_via_apply(
Expand Down
40 changes: 22 additions & 18 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2136,7 +2136,7 @@ def mean(
@final
@Substitution(name="groupby")
@Appender(_common_see_also)
def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
def median(self, numeric_only: bool = False):
"""
Compute median of groups, excluding missing values.

Expand Down Expand Up @@ -2173,7 +2173,7 @@ def std(
ddof: int = 1,
engine: str | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
):
"""
Compute standard deviation of groups, excluding missing values.
Expand Down Expand Up @@ -2202,11 +2202,15 @@ def std(

.. versionadded:: 1.4.0

numeric_only : bool, default True
numeric_only : bool, default False
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

.. versionchanged:: 2.0.0

numeric_only now defaults to ``False``.

Returns
-------
Series or DataFrame
Expand Down Expand Up @@ -2236,7 +2240,6 @@ def std(
post_processing=lambda vals, inference: np.sqrt(vals),
ddof=ddof,
)
self._maybe_warn_numeric_only_depr("std", result, numeric_only)
return result

@final
Expand All @@ -2247,7 +2250,7 @@ def var(
ddof: int = 1,
engine: str | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
):
"""
Compute variance of groups, excluding missing values.
Expand Down Expand Up @@ -2276,11 +2279,15 @@ def var(

.. versionadded:: 1.4.0

numeric_only : bool, default True
numeric_only : bool, default False
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

.. versionchanged:: 2.0.0

numeric_only now defaults to ``False``.

Returns
-------
Series or DataFrame
Expand All @@ -2301,7 +2308,7 @@ def var(
@final
@Substitution(name="groupby")
@Appender(_common_see_also)
def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default):
def sem(self, ddof: int = 1, numeric_only: bool = False):
"""
Compute standard error of the mean of groups, excluding missing values.

Expand All @@ -2317,23 +2324,22 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default

.. versionadded:: 1.5.0

.. versionchanged:: 2.0.0

numeric_only now defaults to ``False``.

Returns
-------
Series or DataFrame
Standard error of the mean of values within each group.
"""
# Resolve numeric_only so that std doesn't warn
numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0)
if (
numeric_only_bool
and self.obj.ndim == 1
and not is_numeric_dtype(self.obj.dtype)
):
if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
raise TypeError(
f"{type(self).__name__}.sem called with "
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
)
result = self.std(ddof=ddof, numeric_only=numeric_only_bool)
result = self.std(ddof=ddof, numeric_only=numeric_only)
self._maybe_warn_numeric_only_depr("sem", result, numeric_only)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is `self._maybe_warn_numeric_only_depr` still needed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe not, but I have another branch built on this that removes it (and should be the final PR for nuisance deprecations).


if result.ndim == 1:
Expand Down Expand Up @@ -2411,10 +2417,8 @@ def sum(
return self._reindex_output(result, fill_value=0)

@final
@doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
def prod(
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
):
@doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
def prod(self, numeric_only: bool = False, min_count: int = 0):
return self._agg_general(
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
)
Expand Down
10 changes: 2 additions & 8 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,7 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype
[[1, 2, 3, 4, 5, 6]] * 3,
columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]),
).astype({("a", "j"): dtype, ("b", "j"): dtype})
warn = FutureWarning if func == "std" else None
msg = "The default value of numeric_only"
with tm.assert_produces_warning(warn, match=msg):
result = df.groupby(level=1, axis=1).agg(func)
Comment on lines -242 to -245
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In 1.5.x we are currently warning in a few erroneous cases with axis=1. Is this worth fixing? I think the PR would have to go straight into 1.5.x because of the enforced deprecations.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say that if we haven't gotten an issue about this yet, it's okay not to fix it for now.

result = df.groupby(level=1, axis=1).agg(func)
expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype(
result_dtype_dict
)
Expand All @@ -266,10 +263,7 @@ def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
columns=Index([10, 20, 10, 20], name="x"),
dtype="int64",
).astype({10: "Int64"})
warn = FutureWarning if func == "std" else None
msg = "The default value of numeric_only"
with tm.assert_produces_warning(warn, match=msg):
result = df.groupby("x", axis=1).agg(func)
result = df.groupby("x", axis=1).agg(func)
expected = DataFrame(
data=expected_data,
index=Index([0, 1, 0], name="y"),
Expand Down
31 changes: 17 additions & 14 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,9 @@ def test_averages(self, df, method):
],
)

if method == "mean":
with pytest.raises(TypeError, match="[Cc]ould not convert"):
getattr(gb, method)()
result = getattr(gb, method)(numeric_only=True)
else:
result = getattr(gb, method)()
with pytest.raises(TypeError, match="[Cc]ould not convert"):
getattr(gb, method)()
result = getattr(gb, method)(numeric_only=True)
tm.assert_frame_equal(result.reindex_like(expected), expected)

expected_columns = expected.columns
Expand Down Expand Up @@ -276,11 +273,12 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
)
with pytest.raises(exception, match=msg):
getattr(gb, method)()
elif method in ("sum", "mean"):
elif method in ("sum", "mean", "median", "prod"):
msg = "|".join(
[
"category type does not support sum operations",
"Could not convert",
"[Cc]ould not convert",
"can't multiply sequence by non-int of type 'str'",
]
)
with pytest.raises(exception, match=msg):
Expand Down Expand Up @@ -1397,18 +1395,18 @@ def test_groupby_sum_timedelta_with_nat():
("last", False, True),
("max", False, True),
("mean", False, True),
("median", True, True),
("median", False, True),
("min", False, True),
("nth", False, False),
("nunique", False, False),
("pct_change", False, False),
("prod", True, True),
("prod", False, True),
("quantile", True, True),
("sem", True, True),
("skew", True, True),
("std", True, True),
("sem", False, True),
("skew", False, True),
("std", False, True),
("sum", False, True),
("var", True, True),
("var", False, True),
],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
Expand Down Expand Up @@ -1592,6 +1590,11 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
warn_msg = ""
err_category = TypeError
err_msg = "Series.skew does not allow numeric_only=True with non-numeric"
elif groupby_func == "sem":
warn_category = None
warn_msg = ""
err_category = TypeError
err_msg = "called with numeric_only=True and dtype object"
else:
warn_category = FutureWarning
warn_msg = "This will raise a TypeError"
Expand Down
98 changes: 29 additions & 69 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import numpy as np
import pytest

from pandas._libs import lib
from pandas.compat import IS64
from pandas.errors import (
PerformanceWarning,
Expand Down Expand Up @@ -909,64 +908,37 @@ def test_keep_nuisance_agg(df, agg_function):
"agg_function",
["sum", "mean", "prod", "std", "var", "sem", "median"],
)
@pytest.mark.parametrize("numeric_only", [lib.no_default, True, False])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_omit_nuisance_agg(df, agg_function, numeric_only):
# GH 38774, GH 38815
if numeric_only is lib.no_default or (not numeric_only and agg_function != "sum"):
# sum doesn't drop strings
warn = FutureWarning
else:
warn = None

grouped = df.groupby("A")

no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
if agg_function in no_drop_nuisance and numeric_only is False:
if agg_function in no_drop_nuisance and not numeric_only:
# Added numeric_only as part of GH#46560; these do not drop nuisance
# columns when numeric_only is False
klass = ValueError if agg_function in ("std", "sem") else TypeError
msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
with pytest.raises(klass, match=msg):
getattr(grouped, agg_function)(numeric_only=numeric_only)
else:
if numeric_only is lib.no_default:
msg = (
f"The default value of numeric_only in DataFrameGroupBy.{agg_function}"
)
else:
msg = "Dropping invalid columns"
with tm.assert_produces_warning(warn, match=msg):
result = getattr(grouped, agg_function)(numeric_only=numeric_only)
if (
(numeric_only is lib.no_default or not numeric_only)
# These methods drop non-numeric columns even when numeric_only is False
and agg_function not in ("mean", "prod", "median")
):
result = getattr(grouped, agg_function)(numeric_only=numeric_only)
if not numeric_only and agg_function == "sum":
# sum is successful on column B
columns = ["A", "B", "C", "D"]
else:
columns = ["A", "C", "D"]
if agg_function == "sum" and numeric_only is False:
# sum doesn't drop nuisance string columns
warn = None
elif agg_function in ("sum", "std", "var", "sem") and numeric_only is not True:
warn = FutureWarning
else:
warn = None
msg = "The default value of numeric_only"
with tm.assert_produces_warning(warn, match=msg):
expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
numeric_only=numeric_only
)
expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
numeric_only=numeric_only
)
tm.assert_frame_equal(result, expected)


def test_omit_nuisance_warnings(df):
def test_raise_on_nuisance_python_single(df):
# GH 38815
with tm.assert_produces_warning(FutureWarning, filter_level="always"):
grouped = df.groupby("A")
result = grouped.skew()
expected = df.loc[:, ["A", "C", "D"]].groupby("A").skew()
tm.assert_frame_equal(result, expected)
grouped = df.groupby("A")
with pytest.raises(TypeError, match="could not convert"):
grouped.skew()


def test_raise_on_nuisance_python_multiple(three_group):
Expand Down Expand Up @@ -2012,14 +1984,9 @@ def get_result(**kwargs):
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
if op == "sum":
with pytest.raises(
TypeError, match="datetime64 type does not support"
):
get_result()
result = get_result(numeric_only=True)
else:
result = get_result()
with pytest.raises(TypeError, match="datetime64 type does not support"):
get_result()
result = get_result(numeric_only=True)

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
Expand All @@ -2030,14 +1997,9 @@ def get_result(**kwargs):
elif isinstance(values, Categorical):
# GH#41291
# Categorical doesn't implement sum or prod
if op == "sum":
with pytest.raises(
TypeError, match="category type does not support"
):
get_result()
result = get_result(numeric_only=True)
else:
result = get_result()
with pytest.raises(TypeError, match="category type does not support"):
get_result()
result = get_result(numeric_only=True)

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
Expand All @@ -2053,24 +2015,22 @@ def get_result(**kwargs):
return

elif df.dtypes[0] == object:
# FIXME: the test is actually wrong here, xref #41341
result = get_result()
# In this case we have list-of-list, will raise TypeError,
# and subsequently be dropped as nuisance columns
if op == "sum":
expected = df.set_index(keys)[["C"]]
else:
expected = df.set_index(keys)[[]]
expected = df.set_index(keys)[["C"]]
tm.assert_equal(result, expected)
return

if (
op in ["min", "max", "skew"]
and isinstance(values, Categorical)
and len(keys) == 1
if (op in ["min", "max", "skew"] and isinstance(values, Categorical)) or (
op == "skew" and df.dtypes[0].kind == "M"
):
if op in ("min", "max"):
with pytest.raises(TypeError, match="Categorical is not ordered"):
if op == "skew" or len(keys) == 1:
msg = "|".join(
[
"Categorical is not ordered",
"does not support reduction",
]
)
with pytest.raises(TypeError, match=msg):
get_result()
return
# Categorical doesn't implement, so with numeric_only=True
Expand Down
Loading