From e87c40c0a8b54f8771853bf3e4ea8afbe2daf0c7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 24 Nov 2022 11:58:42 +0000 Subject: [PATCH 1/2] DEPR: Change numeric_only to False in various groupby ops --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 40 ++++---- .../tests/groupby/aggregate/test_aggregate.py | 10 +- pandas/tests/groupby/test_function.py | 31 +++--- pandas/tests/groupby/test_groupby.py | 95 ++++++------------- .../tests/groupby/transform/test_transform.py | 5 +- pandas/tests/resample/test_resample_api.py | 1 + 8 files changed, 74 insertions(+), 112 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index fd7cc0598f850..11b80b1c0e111 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -572,7 +572,7 @@ Removal of prior version deprecations/changes - Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`) - Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`) - Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`) -- Changed default of ``numeric_only`` to ``False`` in :meth:`.DataFrameGroupBy.sum` and :meth:`.DataFrameGroupBy.mean` (:issue:`46072`) +- Changed default of ``numeric_only`` to ``False`` in various :class:`.DataFrameGroupBy` methods (:issue:`46072`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a9ef1fca7e8e7..fa2e2c9c295d6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2219,7 +2219,7 @@ def skew( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, **kwargs, ) -> DataFrame: result = self._op_via_apply( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 733cf7f262f46..b3f6bb3edb9da 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2136,7 +2136,7 @@ def mean( @final @Substitution(name="groupby") @Appender(_common_see_also) - def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): + def median(self, numeric_only: bool = False): """ Compute median of groups, excluding missing values. @@ -2173,7 +2173,7 @@ def std( ddof: int = 1, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ): """ Compute standard deviation of groups, excluding missing values. @@ -2202,11 +2202,15 @@ def std( .. versionadded:: 1.4.0 - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + Returns ------- Series or DataFrame @@ -2236,7 +2240,6 @@ def std( post_processing=lambda vals, inference: np.sqrt(vals), ddof=ddof, ) - self._maybe_warn_numeric_only_depr("std", result, numeric_only) return result @final @@ -2247,7 +2250,7 @@ def var( ddof: int = 1, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ): """ Compute variance of groups, excluding missing values. @@ -2276,11 +2279,15 @@ def var( .. versionadded:: 1.4.0 - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + Returns ------- Series or DataFrame @@ -2301,7 +2308,7 @@ def var( @final @Substitution(name="groupby") @Appender(_common_see_also) - def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default): + def sem(self, ddof: int = 1, numeric_only: bool = False): """ Compute standard error of the mean of groups, excluding missing values. @@ -2317,23 +2324,22 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default .. versionadded:: 1.5.0 + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + Returns ------- Series or DataFrame Standard error of the mean of values within each group. """ # Reolve numeric_only so that std doesn't warn - numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0) - if ( - numeric_only_bool - and self.obj.ndim == 1 - and not is_numeric_dtype(self.obj.dtype) - ): + if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): raise TypeError( f"{type(self).__name__}.sem called with " f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) - result = self.std(ddof=ddof, numeric_only=numeric_only_bool) + result = self.std(ddof=ddof, numeric_only=numeric_only) self._maybe_warn_numeric_only_depr("sem", result, numeric_only) if result.ndim == 1: @@ -2411,10 +2417,8 @@ def sum( return self._reindex_output(result, fill_value=0) @final - @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) - def prod( - self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 - ): + @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0) + def prod(self, numeric_only: bool = False, min_count: int = 0): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 8ffc49cd25915..2d3ff95504371 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -239,10 +239,7 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype [[1, 2, 3, 4, 5, 6]] * 3, columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), ).astype({("a", "j"): dtype, ("b", "j"): dtype}) - warn = FutureWarning if func == "std" else None - msg = "The default value of numeric_only" - with tm.assert_produces_warning(warn, match=msg): - result = df.groupby(level=1, axis=1).agg(func) + result = df.groupby(level=1, axis=1).agg(func) expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( result_dtype_dict ) @@ -266,10 +263,7 @@ def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): columns=Index([10, 20, 10, 20], name="x"), dtype="int64", ).astype({10: "Int64"}) - warn = FutureWarning if func == "std" else None - msg = "The default value of numeric_only" - with tm.assert_produces_warning(warn, match=msg): - result = df.groupby("x", axis=1).agg(func) + result = df.groupby("x", axis=1).agg(func) expected = DataFrame( data=expected_data, index=Index([0, 1, 0], name="y"), diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e41665680e42d..0f301e05dc898 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -169,12 +169,9 @@ def test_averages(self, df, method): ], ) - if method == "mean": - with pytest.raises(TypeError, match="[Cc]ould not convert"): - getattr(gb, method)() - result = getattr(gb, method)(numeric_only=True) - else: - result = getattr(gb, method)() + with pytest.raises(TypeError, match="[Cc]ould not convert"): + getattr(gb, method)() + result = getattr(gb, method)(numeric_only=True) tm.assert_frame_equal(result.reindex_like(expected), expected) expected_columns = expected.columns @@ -276,11 +273,12 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): ) with pytest.raises(exception, match=msg): getattr(gb, method)() - elif method in ("sum", "mean"): + elif method in ("sum", "mean", "median", "prod"): msg = "|".join( [ "category type does not support sum operations", - "Could not convert", + "[Cc]ould not convert", + "can't multiply sequence by non-int of type 'str'", ] ) with pytest.raises(exception, match=msg): @@ -1397,18 +1395,18 @@ def test_groupby_sum_timedelta_with_nat(): ("last", False, True), ("max", False, True), ("mean", False, True), - ("median", True, True), + ("median", False, True), ("min", False, True), ("nth", False, False), ("nunique", False, False), ("pct_change", False, False), - ("prod", True, True), + ("prod", False, True), ("quantile", True, True), - ("sem", True, True), - ("skew", True, True), - ("std", True, True), + ("sem", False, True), + ("skew", False, True), + ("std", False, True), ("sum", False, True), - ("var", True, True), + ("var", False, True), ], ) @pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) @@ -1592,6 +1590,11 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): warn_msg = "" err_category = TypeError err_msg = "Series.skew does not allow numeric_only=True with non-numeric" + elif groupby_func == "sem": + warn_category = None + warn_msg = "" + err_category = TypeError + err_msg = "called with numeric_only=True and dtype object" else: warn_category = FutureWarning warn_msg = "This will raise a TypeError" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f8ee17fe80f27..1686f7c9081b7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas._libs import lib from pandas.compat import IS64 from pandas.errors import ( PerformanceWarning, @@ -909,19 +908,13 @@ def test_keep_nuisance_agg(df, agg_function): "agg_function", ["sum", "mean", "prod", "std", "var", "sem", "median"], ) -@pytest.mark.parametrize("numeric_only", [lib.no_default, True, False]) +@pytest.mark.parametrize("numeric_only", [True, False]) def test_omit_nuisance_agg(df, agg_function, numeric_only): # GH 38774, GH 38815 - if numeric_only is lib.no_default or (not numeric_only and agg_function != "sum"): - # sum doesn't drop strings - warn = FutureWarning - else: - warn = None - grouped = df.groupby("A") no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median") - if agg_function in no_drop_nuisance and numeric_only is False: + if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False klass = ValueError if agg_function in ("std", "sem") else TypeError @@ -929,44 +922,23 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: - if numeric_only is lib.no_default: - msg = ( - f"The default value of numeric_only in DataFrameGroupBy.{agg_function}" - ) - else: - msg = "Dropping invalid columns" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(grouped, agg_function)(numeric_only=numeric_only) - if ( - (numeric_only is lib.no_default or not numeric_only) - # These methods drop non-numeric columns even when numeric_only is False - and agg_function not in ("mean", "prod", "median") - ): + result = getattr(grouped, agg_function)(numeric_only=numeric_only) + if not numeric_only and agg_function == "sum": + # sum is successful on column B columns = ["A", "B", "C", "D"] else: columns = ["A", "C", "D"] - if agg_function == "sum" and numeric_only is False: - # sum doesn't drop nuisance string columns - warn = None - elif agg_function in ("sum", "std", "var", "sem") and numeric_only is not True: - warn = FutureWarning - else: - warn = None - msg = "The default value of numeric_only" - with tm.assert_produces_warning(warn, match=msg): - expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( - numeric_only=numeric_only - ) + expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( + numeric_only=numeric_only + ) tm.assert_frame_equal(result, expected) -def test_omit_nuisance_warnings(df): +def test_raise_on_nuisance_python_single(df): # GH 38815 - with tm.assert_produces_warning(FutureWarning, filter_level="always"): - grouped = df.groupby("A") - result = grouped.skew() - expected = df.loc[:, ["A", "C", "D"]].groupby("A").skew() - tm.assert_frame_equal(result, expected) + grouped = df.groupby("A") + with pytest.raises(TypeError, match="could not convert"): + grouped.skew() def test_raise_on_nuisance_python_multiple(three_group): @@ -2012,14 +1984,9 @@ def get_result(**kwargs): if df.dtypes[0].kind == "M": # GH#41291 # datetime64 -> prod and sum are invalid - if op == "sum": - with pytest.raises( - TypeError, match="datetime64 type does not support" - ): - get_result() - result = get_result(numeric_only=True) - else: - result = get_result() + with pytest.raises(TypeError, match="datetime64 type does not support"): + get_result() + result = get_result(numeric_only=True) # with numeric_only=True, these are dropped, and we get # an empty DataFrame back @@ -2030,14 +1997,9 @@ def get_result(**kwargs): elif isinstance(values, Categorical): # GH#41291 # Categorical doesn't implement sum or prod - if op == "sum": - with pytest.raises( - TypeError, match="category type does not support" - ): - get_result() - result = get_result(numeric_only=True) - else: - result = get_result() + with pytest.raises(TypeError, match="category type does not support"): + get_result() + result = get_result(numeric_only=True) # with numeric_only=True, these are dropped, and we get # an empty DataFrame back @@ -2057,20 +2019,21 @@ def get_result(**kwargs): result = get_result() # In this case we have list-of-list, will raise TypeError, # and subsequently be dropped as nuisance columns - if op == "sum": - expected = df.set_index(keys)[["C"]] - else: - expected = df.set_index(keys)[[]] + expected = df.set_index(keys)[["C"]] tm.assert_equal(result, expected) return - if ( - op in ["min", "max", "skew"] - and isinstance(values, Categorical) - and len(keys) == 1 + if (op in ["min", "max", "skew"] and isinstance(values, Categorical)) or ( + op == "skew" and df.dtypes[0].kind == "M" ): - if op in ("min", "max"): - with pytest.raises(TypeError, match="Categorical is not ordered"): + if op == "skew" or len(keys) == 1: + msg = "|".join( + [ + "Categorical is not ordered", + "does not support reduction", + ] + ) + with pytest.raises(TypeError, match=msg): get_result() return # Categorical doesn't implement, so with numeric_only=True diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index beb891f9c0b11..23005f291970b 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -193,12 +193,9 @@ def test_transform_axis_1_reducer(request, reduction_func): ): marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#45986") request.node.add_marker(marker) - warn = FutureWarning if reduction_func in ("sem", "std") else None - msg = "The default value of numeric_only" df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) - with tm.assert_produces_warning(warn, match=msg): - result = df.groupby([0, 0, 1], axis=1).transform(reduction_func) + result = df.groupby([0, 0, 1], axis=1).transform(reduction_func) expected = df.T.groupby([0, 0, 1]).transform(reduction_func).T tm.assert_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index ca5444fd4e62f..50382c70503fc 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -871,6 +871,7 @@ def test_frame_downsample_method(method, numeric_only, expected_data): "first", "last", "prod", + "std", ): warn = FutureWarning msg = ( From 8a507e541c135f1d4fe029c8fdc87f7892ada58d Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 25 Nov 2022 10:16:01 -0500 Subject: [PATCH 2/2] Remove FIXME --- pandas/tests/groupby/test_groupby.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1686f7c9081b7..c35930ed43607 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2015,10 +2015,7 @@ def get_result(**kwargs): return elif df.dtypes[0] == object: - # FIXME: the test is actually wrong here, xref #41341 result = get_result() - # In this case we have list-of-list, will raise TypeError, - # and subsequently be dropped as nuisance columns expected = df.set_index(keys)[["C"]] tm.assert_equal(result, expected) return