From 1ae124cd8d25525764f3d7a38cf4ea9ddf9dd224 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 12 Nov 2022 10:08:22 -0500 Subject: [PATCH 1/3] DEPR: Enforce deprecation of dropping columns when numeric_only=False in groupby / resample --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/groupby/generic.py | 4 +- pandas/core/groupby/groupby.py | 7 +- pandas/tests/groupby/test_function.py | 155 +++++++++++---------- pandas/tests/groupby/test_groupby.py | 13 +- pandas/tests/groupby/test_min_max.py | 10 +- pandas/tests/resample/test_resample_api.py | 15 +- 7 files changed, 115 insertions(+), 90 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 032bcf09244e5..2ddd9c705f21d 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -557,6 +557,7 @@ Removal of prior version deprecations/changes - Enforced deprecation ``numeric_only=None`` (the default) in DataFrame reductions that would silently drop columns that raised; ``numeric_only`` now defaults to ``False`` (:issue:`41480`) - Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`) - Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`) +- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 571559dc838f5..32802f6429c4c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1357,7 +1357,9 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: # We could use `mgr.apply` here and not have to set_axis, but # we would have to do shape gymnastics for ArrayManager compat - res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True) + res_mgr = mgr.grouped_reduce( + arr_func, ignore_failures=numeric_only is lib.no_default + ) res_mgr.set_axis(1, mgr.axes[1]) if len(res_mgr) < orig_mgr_len: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d10931586d5e0..f16a8cce0863b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1664,6 +1664,7 @@ def _agg_general( alt=npfunc, numeric_only=numeric_only, min_count=min_count, + ignore_failures=numeric_only is lib.no_default, ) return result.__finalize__(self.obj, method="groupby") @@ -2132,6 +2133,7 @@ def mean( "mean", alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool), numeric_only=numeric_only, + ignore_failures=numeric_only is lib.no_default, ) return result.__finalize__(self.obj, method="groupby") @@ -2161,6 +2163,7 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): "median", alt=lambda x: Series(x).median(numeric_only=numeric_only_bool), numeric_only=numeric_only, + ignore_failures=numeric_only is lib.no_default, ) return result.__finalize__(self.obj, method="groupby") @@ -3761,7 +3764,9 @@ def blk_func(values: ArrayLike) -> ArrayLike: if numeric_only_bool: mgr = mgr.get_numeric_data() - res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) + res_mgr = mgr.grouped_reduce( + blk_func, ignore_failures=numeric_only is lib.no_default + ) if not is_ser and len(res_mgr.items) != orig_mgr_len: howstr = how.replace("group_", "") diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index f05874c3286c7..8e6122a66de09 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -163,14 +163,12 @@ def test_averages(self, df, method): "int", "float", "category_int", - "datetime", - "datetimetz", - "timedelta", ], ) - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): - result = getattr(gb, method)(numeric_only=False) + with pytest.raises(TypeError, match="[Cc]ould not convert"): + getattr(gb, method)(numeric_only=False) + result = getattr(gb, method)() tm.assert_frame_equal(result.reindex_like(expected), expected) expected_columns = expected.columns @@ -252,30 +250,37 @@ def test_cummin_cummax(self, df, method): def _check(self, df, method, expected_columns, expected_columns_numeric): gb = df.groupby("group") - # cummin, cummax dont have numeric_only kwarg, always use False - warn = None - if method in ["cummin", "cummax"]: - # these dont have numeric_only kwarg, always use False - warn = FutureWarning - elif method in ["min", "max"]: - # these have numeric_only kwarg, but default to False - warn = FutureWarning - - with tm.assert_produces_warning( - warn, match="Dropping invalid columns", raise_on_extra_warnings=False - ): + if method in ("min", "max"): + # The methods default to numeric_only=False and raise TypeError + with pytest.raises(TypeError, match="Categorical is not ordered"): + getattr(gb, method)() + elif method in ("cummin", "cummax"): + # The methods default to numeric_only=False and raise NotImplementedError + msg = "function is not implemented for this dtype" + with pytest.raises(NotImplementedError, match=msg): + getattr(gb, method)() + else: result = getattr(gb, method)() - - tm.assert_index_equal(result.columns, expected_columns_numeric) - - # GH#41475 deprecated silently ignoring nuisance columns - warn = None - if len(expected_columns) < len(gb._obj_with_exclusions.columns): - warn = FutureWarning - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + tm.assert_index_equal(result.columns, expected_columns_numeric) + + if method in ("cumsum", "cumprod", "cummin", "cummax"): + msg = "function is not implemented for this dtype" + with pytest.raises(NotImplementedError, match=msg): + getattr(gb, method)(numeric_only=False) + elif method not in ("first", "last"): + msg = "|".join( + [ + "[Cc]ould not convert", + "Categorical is not ordered", + "category type does not support", + "can't multiply sequence", + ] + ) + with pytest.raises(TypeError, match=msg): + getattr(gb, method)(numeric_only=False) + else: result = getattr(gb, method)(numeric_only=False) - - tm.assert_index_equal(result.columns, expected_columns) + tm.assert_index_equal(result.columns, expected_columns) class TestGroupByNonCythonPaths: @@ -1323,45 +1328,45 @@ def test_groupby_sum_timedelta_with_nat(): @pytest.mark.parametrize( - "kernel, numeric_only_default, drops_nuisance, has_arg", + "kernel, numeric_only_default, has_arg", [ - ("all", False, False, False), - ("any", False, False, False), - ("bfill", False, False, False), - ("corr", True, False, True), - ("corrwith", True, False, True), - ("cov", True, False, True), - ("cummax", False, True, True), - ("cummin", False, True, True), - ("cumprod", True, True, True), - ("cumsum", True, True, True), - ("diff", False, False, False), - ("ffill", False, False, False), - ("fillna", False, False, False), - ("first", False, False, True), - ("idxmax", True, False, True), - ("idxmin", True, False, True), - ("last", False, False, True), - ("max", False, True, True), - ("mean", True, True, True), - ("median", True, True, True), - ("min", False, True, True), - ("nth", False, False, False), - ("nunique", False, False, False), - ("pct_change", False, False, False), - ("prod", True, True, True), - ("quantile", True, False, True), - ("sem", True, True, True), - ("skew", True, False, True), - ("std", True, True, True), - ("sum", True, True, True), - ("var", True, False, True), + ("all", False, False), + ("any", False, False), + ("bfill", False, False), + ("corr", True, True), + ("corrwith", True, True), + ("cov", True, True), + ("cummax", False, True), + ("cummin", False, True), + ("cumprod", True, True), + ("cumsum", True, True), + ("diff", False, False), + ("ffill", False, False), + ("fillna", False, False), + ("first", False, True), + ("idxmax", True, True), + ("idxmin", True, True), + ("last", False, True), + ("max", False, True), + ("mean", True, True), + ("median", True, True), + ("min", False, True), + ("nth", False, False), + ("nunique", False, False), + ("pct_change", False, False), + ("prod", True, True), + ("quantile", True, True), + ("sem", True, True), + ("skew", True, True), + ("std", True, True), + ("sum", True, True), + ("var", True, True), ], ) @pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) def test_deprecate_numeric_only( - kernel, numeric_only_default, drops_nuisance, has_arg, numeric_only, keys + kernel, numeric_only_default, has_arg, numeric_only, keys ): # GH#46072 # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False @@ -1380,10 +1385,9 @@ def test_deprecate_numeric_only( # Cases where b does not appear in the result numeric_only is True or (numeric_only is lib.no_default and numeric_only_default) - or drops_nuisance ) ): - if numeric_only is True or (not numeric_only_default and not drops_nuisance): + if numeric_only is True or not numeric_only_default: warn = None else: warn = FutureWarning @@ -1408,17 +1412,24 @@ def test_deprecate_numeric_only( assert "b" in result.columns elif has_arg or kernel in ("idxmax", "idxmin"): assert numeric_only is not True - assert not drops_nuisance # kernels that are successful on any dtype were above; this will fail - msg = ( - "(not allowed for this dtype" - "|must be a string or a number" - "|cannot be performed against 'object' dtypes" - "|must be a string or a real number" - "|unsupported operand type)" - ) - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) + if kernel in ("cummax", "cummin", "cumprod", "cumsum"): + msg = "function is not implemented for this dtype" + with pytest.raises(NotImplementedError, match=msg): + method(*args, **kwargs) + else: + msg = "|".join( + [ + "not allowed for this dtype", + "must be a string or a number", + "cannot be performed against 'object' dtypes", + "must be a string or a real number", + "unsupported operand type", + "not supported between instances of", + ] + ) + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: with pytest.raises( TypeError, match="got an unexpected keyword argument 'numeric_only'" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7fd52d3cf5bb8..96be7a0cb785c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -915,11 +915,13 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): grouped = df.groupby("A") - if agg_function in ("var", "std", "sem") and numeric_only is False: + no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median") + if agg_function in no_drop_nuisance and numeric_only is False: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False - klass = TypeError if agg_function == "var" else ValueError - with pytest.raises(klass, match="could not convert string to float"): + klass = ValueError if agg_function in ("std", "sem") else TypeError + msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"]) + with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: if numeric_only is lib.no_default: @@ -2049,10 +2051,13 @@ def get_result(): and isinstance(values, Categorical) and len(keys) == 1 ): + if op in ("min", "max"): + with pytest.raises(TypeError, match="Categorical is not ordered"): + get_result() + return # Categorical doesn't implement, so with numeric_only=True # these are dropped and we get an empty DataFrame back result = get_result() - expected = df.set_index(keys)[[]] # with numeric_only=True, these are dropped, and we get # an empty DataFrame back diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index b26ee057d2041..72772775b3fa1 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -48,15 +48,17 @@ def test_max_min_object_multiple_columns(using_array_manager): gb = df.groupby("A") - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): - result = gb.max(numeric_only=False) + with pytest.raises(TypeError, match="not supported between instances"): + gb.max(numeric_only=False) + result = gb[["C"]].max() # "max" is valid for column "C" but not for "B" ei = Index([1, 2, 3], name="A") expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): - result = gb.min(numeric_only=False) + with pytest.raises(TypeError, match="not supported between instances"): + gb.max(numeric_only=False) + result = gb[["C"]].min() # "min" is valid for column "C" but not for "B" ei = Index([1, 2, 3], name="A") expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 5721de9e5f3bb..ca5444fd4e62f 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -821,8 +821,8 @@ def test_end_and_end_day_origin( ("sum", False, {"cat": ["cat_1cat_2"], "num": [25]}), ("sum", lib.no_default, {"num": [25]}), ("prod", True, {"num": [100]}), - ("prod", False, {"num": [100]}), - ("prod", lib.no_default, {"num": [100]}), + ("prod", False, "can't multiply sequence"), + ("prod", lib.no_default, "can't multiply sequence"), ("min", True, {"num": [5]}), ("min", False, {"cat": ["cat_1"], "num": [5]}), ("min", lib.no_default, {"cat": ["cat_1"], "num": [5]}), @@ -836,10 +836,10 @@ def test_end_and_end_day_origin( ("last", False, {"cat": ["cat_2"], "num": [20]}), ("last", lib.no_default, {"cat": ["cat_2"], "num": [20]}), ("mean", True, {"num": [12.5]}), - ("mean", False, {"num": [12.5]}), + ("mean", False, "Could not convert"), ("mean", lib.no_default, {"num": [12.5]}), ("median", True, {"num": [12.5]}), - ("median", False, {"num": [12.5]}), + ("median", False, "could not convert"), ("median", lib.no_default, {"num": [12.5]}), ("std", True, {"num": [10.606601717798213]}), ("std", False, "could not convert string to float"), @@ -876,15 +876,14 @@ def test_frame_downsample_method(method, numeric_only, expected_data): msg = ( f"default value of numeric_only in DataFrameGroupBy.{method} is deprecated" ) - elif method in ("prod", "mean", "median") and numeric_only is not True: - warn = FutureWarning - msg = f"Dropping invalid columns in DataFrameGroupBy.{method} is deprecated" else: warn = None msg = "" with tm.assert_produces_warning(warn, match=msg): if isinstance(expected_data, str): - klass = TypeError if method == "var" else ValueError + klass = ( + TypeError if method in ("var", "mean", "median", "prod") else ValueError + ) with pytest.raises(klass, match=expected_data): _ = func(**kwargs) else: From cbd87eb3e5eccff17e341d9f3ac92ba65bf6edf6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 12 Nov 2022 10:40:03 -0500 Subject: [PATCH 2/3] Change to TypeError --- pandas/core/groupby/generic.py | 10 ++++-- pandas/tests/groupby/test_function.py | 52 ++++++++++++--------------- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 32802f6429c4c..97cdefe64ba5e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1357,9 +1357,13 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: # We could use `mgr.apply` here and not have to set_axis, but # we would have to do shape gymnastics for ArrayManager compat - res_mgr = mgr.grouped_reduce( - arr_func, ignore_failures=numeric_only is lib.no_default - ) + try: + res_mgr = mgr.grouped_reduce( + arr_func, ignore_failures=numeric_only is lib.no_default + ) + except NotImplementedError as err: + msg = f"{how} is not supported for at least one provided dtype" + raise TypeError(msg) from err res_mgr.set_axis(1, mgr.axes[1]) if len(res_mgr) < orig_mgr_len: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 8e6122a66de09..69c839c3b2273 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -250,30 +250,28 @@ def test_cummin_cummax(self, df, method): def _check(self, df, method, expected_columns, expected_columns_numeric): gb = df.groupby("group") - if method in ("min", "max"): + if method in ("min", "max", "cummin", "cummax"): # The methods default to numeric_only=False and raise TypeError - with pytest.raises(TypeError, match="Categorical is not ordered"): - getattr(gb, method)() - elif method in ("cummin", "cummax"): - # The methods default to numeric_only=False and raise NotImplementedError - msg = "function is not implemented for this dtype" - with pytest.raises(NotImplementedError, match=msg): + msg = "|".join( + [ + "Categorical is not ordered", + "is not supported for at least one provided dtype", + ] + ) + with pytest.raises(TypeError, match=msg): getattr(gb, method)() else: result = getattr(gb, method)() tm.assert_index_equal(result.columns, expected_columns_numeric) - if method in ("cumsum", "cumprod", "cummin", "cummax"): - msg = "function is not implemented for this dtype" - with pytest.raises(NotImplementedError, match=msg): - getattr(gb, method)(numeric_only=False) - elif method not in ("first", "last"): + if method not in ("first", "last"): msg = "|".join( [ "[Cc]ould not convert", "Categorical is not ordered", "category type does not support", "can't multiply sequence", + "is not supported for at least one provided dtype", ] ) with pytest.raises(TypeError, match=msg): @@ -1413,23 +1411,19 @@ def test_deprecate_numeric_only( elif has_arg or kernel in ("idxmax", "idxmin"): assert numeric_only is not True # kernels that are successful on any dtype were above; this will fail - if kernel in ("cummax", "cummin", "cumprod", "cumsum"): - msg = "function is not implemented for this dtype" - with pytest.raises(NotImplementedError, match=msg): - method(*args, **kwargs) - else: - msg = "|".join( - [ - "not allowed for this dtype", - "must be a string or a number", - "cannot be performed against 'object' dtypes", - "must be a string or a real number", - "unsupported operand type", - "not supported between instances of", - ] - ) - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) + msg = "|".join( + [ + "not allowed for this dtype", + "must be a string or a number", + "cannot be performed against 'object' dtypes", + "must be a string or a real number", + "unsupported operand type", + "not supported between instances of", + "is not supported for at least one provided dtype", + ] + ) + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: with pytest.raises( TypeError, match="got an unexpected keyword argument 'numeric_only'" From 4179204aacf76c59424ee1d34f6ca012845beffe Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 12 Nov 2022 12:56:42 -0500 Subject: [PATCH 3/3] Better error message --- pandas/core/groupby/generic.py | 4 ++-- pandas/tests/groupby/test_function.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 97cdefe64ba5e..4ac701952e258 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1362,8 +1362,8 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: arr_func, ignore_failures=numeric_only is lib.no_default ) except NotImplementedError as err: - msg = f"{how} is not supported for at least one provided dtype" - raise TypeError(msg) from err + # For NotImplementedError, args[0] is the error message + raise TypeError(err.args[0]) from err res_mgr.set_axis(1, mgr.axes[1]) if len(res_mgr) < orig_mgr_len: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 69c839c3b2273..4adf2d5f38ead 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -255,7 +255,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): msg = "|".join( [ "Categorical is not ordered", - "is not supported for at least one provided dtype", + "function is not implemented for this dtype", ] ) with pytest.raises(TypeError, match=msg): @@ -271,7 +271,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "Categorical is not ordered", "category type does not support", "can't multiply sequence", - "is not supported for at least one provided dtype", + "function is not implemented for this dtype", ] ) with pytest.raises(TypeError, match=msg): @@ -1419,7 +1419,7 @@ def test_deprecate_numeric_only( "must be a string or a real number", "unsupported operand type", "not supported between instances of", - "is not supported for at least one provided dtype", + "function is not implemented for this dtype", ] ) with pytest.raises(TypeError, match=msg):