From df082483165f8264ae71848d9359dea0779fc59e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 4 Jan 2023 15:41:26 -0800 Subject: [PATCH 1/6] DEPR: Remove silent dropping of nuisance columns in window ops --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/window/rolling.py | 25 +++--------------- pandas/tests/window/test_api.py | 15 ++++------- pandas/tests/window/test_ewm.py | 8 +++--- pandas/tests/window/test_groupby.py | 41 +++++++---------------------- pandas/tests/window/test_numba.py | 22 ++++++---------- 6 files changed, 31 insertions(+), 81 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ea6a832d25058..edce0ff4ce7c9 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -728,6 +728,7 @@ Removal of prior version deprecations/changes - Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`) - Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`) - Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`) +- Enforced deprecation of silently dropping nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` ops (:issue:`42834`) - Changed behavior in setting values with ``df.loc[:, foo] = bar`` or ``df.iloc[:, foo] = bar``, these now always attempt to set values inplace before falling back to casting (:issue:`45333`) - Changed default of ``numeric_only`` in various :class:`.DataFrameGroupBy` methods; all methods now default to ``numeric_only=False`` (:issue:`46072`) - Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 989b82f45339f..12249c2b40065 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -18,7 +18,6 @@ Sized, cast, ) -import warnings import numpy as np @@ -37,7 +36,6 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import DataError from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_float64, @@ -484,13 +482,10 @@ def hfunc(values: ArrayLike) -> ArrayLike: res_values = [] for i, arr in enumerate(obj._iter_column_arrays()): # GH#42736 operate column-wise instead of block-wise - try: - res = hfunc(arr) - except (TypeError, NotImplementedError): - pass - else: - res_values.append(res) - taker.append(i) + # As of 2.0, hfunc will raise for nuisance columns + res = hfunc(arr) + res_values.append(res) + taker.append(i) index = self._slice_axis_for_step( obj.index, res_values[0] if len(res_values) > 0 else None @@ -505,18 +500,6 @@ def hfunc(values: ArrayLike) -> ArrayLike: if self.axis == 1: df = df.T - if 0 != len(res_values) != len(obj.columns): - # GH#42738 ignore_failures dropped nuisance columns - dropped = obj.columns.difference(obj.columns.take(taker)) - warnings.warn( - "Dropping of nuisance columns in rolling operations " - "is deprecated; in a future version this will raise TypeError. " - "Select only valid columns before calling the operation. " - f"Dropped columns were {dropped}", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._resolve_output(df, obj) def _apply_tablewise( diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index bc723b8ed36b8..66932e9308d28 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -66,18 +66,13 @@ def tests_skip_nuisance(step): tm.assert_frame_equal(result, expected) -def test_skip_sum_object_raises(step): +def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) - msg = r"nuisance columns.*Dropped columns were Index\(\['C'\], dtype='object'\)" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#42738 - result = r.sum() - expected = DataFrame( - {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, - columns=list("AB"), - )[::step] - tm.assert_frame_equal(result, expected) + msg = r"cannot handle this type -> object" + with pytest.raises(TypeError, match=msg): + # GH#42738, enforced in 2.0 + r.sum() def test_agg(step): diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index f88c20f2f78c6..205a02dcb051b 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -98,11 +98,9 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): halflife = halflife_with_times data = np.arange(10.0) data[::2] = np.nan - df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) - with tm.assert_produces_warning(FutureWarning, match="nuisance columns"): - # GH#42738 - result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() - expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() + df = DataFrame({"A": data}) + result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() + expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 3da14bce6facd..fbcff5f709a69 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1125,11 +1125,10 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match="nuisance"): - # GH#42738 - expected = df.groupby("A", group_keys=True).apply( - lambda x: getattr(x.ewm(com=1.0), method)() - ) + # GH#42738 + expected = df.groupby("A", group_keys=True).apply( + lambda x: getattr(x.ewm(com=1.0), method)() + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -1160,13 +1159,9 @@ def test_pairwise_methods(self, method, expected_data): def test_times(self, times_frame): # GH 40951 halflife = "23 days" - with tm.assert_produces_warning(FutureWarning, match="nuisance"): - # GH#42738 - result = ( - times_frame.groupby("A") - .ewm(halflife=halflife, times=times_frame["C"]) - .mean() - ) + # GH#42738 + times = times_frame.pop("C") + result = times_frame.groupby("A").ewm(halflife=halflife, times=times).mean() expected = DataFrame( { "B": [ @@ -1200,29 +1195,13 @@ def test_times(self, times_frame): ) tm.assert_frame_equal(result, expected) - def test_times_vs_apply(self, times_frame): - # GH 40951 - halflife = "23 days" - with tm.assert_produces_warning(FutureWarning, match="nuisance"): - # GH#42738 - result = ( - times_frame.groupby("A") - .ewm(halflife=halflife, times=times_frame["C"]) - .mean() - ) - expected = times_frame.groupby("A", group_keys=True).apply( - lambda x: x.ewm(halflife=halflife, times=x["C"]).mean() - ) - tm.assert_frame_equal(result, expected) - def test_times_array(self, times_frame): # GH 40951 halflife = "23 days" + times = times_frame.pop("C") gb = times_frame.groupby("A") - with tm.assert_produces_warning(FutureWarning, match="nuisance"): - # GH#42738 - result = gb.ewm(halflife=halflife, times=times_frame["C"]).mean() - expected = gb.ewm(halflife=halflife, times=times_frame["C"].values).mean() + result = gb.ewm(halflife=halflife, times=times).mean() + expected = gb.ewm(halflife=halflife, times=times.values).mean() tm.assert_frame_equal(result, expected) def test_dont_mutate_obj_after_slicing(self): diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 1c78a186e9d37..cca0ab3a0a9bb 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -253,22 +253,19 @@ def test_invalid_engine_kwargs(self, grouper, method): def test_cython_vs_numba( self, grouper, method, nogil, parallel, nopython, ignore_na, adjust ): + df = DataFrame({"B": range(4)}) if grouper == "None": grouper = lambda x: x - warn = FutureWarning else: + df["A"] = ["a", "b", "a", "b"] grouper = lambda x: x.groupby("A") - warn = None if method == "sum": adjust = True - df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - with tm.assert_produces_warning(warn, match="nuisance"): - # GH#42738 - result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs) - expected = getattr(ewm, method)(engine="cython") + result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs) + expected = getattr(ewm, method)(engine="cython") tm.assert_frame_equal(result, expected) @@ -276,12 +273,12 @@ def test_cython_vs_numba( def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na): # GH 40951 + df = DataFrame({"B": [0, 0, 1, 1, 2, 2]}) if grouper == "None": grouper = lambda x: x - warn = FutureWarning else: grouper = lambda x: x.groupby("A") - warn = None + df["A"] = ["a", "b", "a", "b", "b", "a"] halflife = "23 days" times = to_datetime( @@ -294,17 +291,14 @@ def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_ "2020-01-03", ] ) - df = DataFrame({"A": ["a", "b", "a", "b", "b", "a"], "B": [0, 0, 1, 1, 2, 2]}) ewm = grouper(df).ewm( halflife=halflife, adjust=True, ignore_na=ignore_na, times=times ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - with tm.assert_produces_warning(warn, match="nuisance"): - # GH#42738 - result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) - expected = ewm.mean(engine="cython") + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected) From 3db24ffcbbce85be5425d1a723fbd85b1bb042bf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Jan 2023 09:57:00 -0800 Subject: [PATCH 2/6] Align exception with series and fix test --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/window/rolling.py | 12 +++++++----- pandas/tests/window/test_api.py | 8 +++++--- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index edce0ff4ce7c9..1ae88ecc65274 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -728,7 +728,7 @@ Removal of prior version deprecations/changes - Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`) - Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`) - Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`) -- Enforced deprecation of silently dropping nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` ops (:issue:`42834`) +- Enforced deprecation of silently dropping nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` ops. This will now raise a :class:`.errors.DataError` (:issue:`42834`) - Changed behavior in setting values with ``df.loc[:, foo] = bar`` or ``df.iloc[:, foo] = bar``, these now always attempt to set values inplace before falling back to casting (:issue:`45333`) - Changed default of ``numeric_only`` in various :class:`.DataFrameGroupBy` methods; all methods now default to ``numeric_only=False`` (:issue:`46072`) - Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 12249c2b40065..ef0524e48f9e2 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -471,10 +471,6 @@ def _apply_blockwise( obj = notna(obj).astype(int) obj._mgr = obj._mgr.consolidate() - def hfunc(values: ArrayLike) -> ArrayLike: - values = self._prep_values(values) - return homogeneous_func(values) - if self.axis == 1: obj = obj.T @@ -483,7 +479,13 @@ def hfunc(values: ArrayLike) -> ArrayLike: for i, arr in enumerate(obj._iter_column_arrays()): # GH#42736 operate column-wise instead of block-wise # As of 2.0, hfunc will raise for nuisance columns - res = hfunc(arr) + try: + arr = self._prep_values(arr) + except (TypeError, NotImplementedError) as err: + raise DataError( + f"Cannot aggregate non-numeric type: {arr.dtype}" + ) from err + res = homogeneous_func(arr) res_values.append(res) taker.append(i) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 66932e9308d28..6180d4a5f8e17 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -1,7 +1,10 @@ import numpy as np import pytest -from pandas.errors import SpecificationError +from pandas.errors import ( + DataError, + SpecificationError, +) from pandas import ( DataFrame, @@ -69,8 +72,7 @@ def tests_skip_nuisance(step): def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) - msg = r"cannot handle this type -> object" - with pytest.raises(TypeError, match=msg): + with pytest.raises(DataError, match="Cannot aggregate non-numeric type: object"): # GH#42738, enforced in 2.0 r.sum() From 2372a6dff41417331ca96e4fe6945ac912988002 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Jan 2023 11:57:27 -0800 Subject: [PATCH 3/6] Fix message --- pandas/tests/window/test_dtypes.py | 2 +- pandas/tests/window/test_groupby.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 52011a2d5f760..b975a28273337 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -165,7 +165,7 @@ def test_dataframe_dtypes(method, expected_data, dtypes, min_periods, step): rolled = df.rolling(2, min_periods=min_periods, step=step) if dtypes in ("m8[ns]", "M8[ns]", "datetime64[ns, UTC]") and method != "count": - msg = "No numeric types to aggregate" + msg = "Cannot aggregate non-numeric type" with pytest.raises(DataError, match=msg): getattr(rolled, method)() else: diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index fbcff5f709a69..41b2ee70d7987 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1125,12 +1125,6 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) - # GH#42738 - expected = df.groupby("A", group_keys=True).apply( - lambda x: getattr(x.ewm(com=1.0), method)() - ) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( "method, expected_data", [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]], From 8e3e6b9d89b74dc5668a742cbe13123b996bd41d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Jan 2023 13:02:30 -0800 Subject: [PATCH 4/6] Fix asv --- asv_bench/benchmarks/rolling.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index d65a1a39e8bc7..0eca597c1bc66 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -307,6 +307,9 @@ def setup(self, method, window_kwargs): "C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10), } ) + if "on" in kwargs: + key = kwargs.pop("on") + df = df.set_index(key) self.groupby_window = getattr(df.groupby("A"), window)(**kwargs) def time_method(self, method, window_kwargs): From ab5b403ccdbcf4ff6cdd5f8ef870658ea3bfa7cd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Jan 2023 13:45:44 -0800 Subject: [PATCH 5/6] fix asv again --- asv_bench/benchmarks/rolling.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 0eca597c1bc66..84e12199dc289 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -292,7 +292,7 @@ class Groupby: ["sum", "median", "mean", "max", "min", "kurt", "sum"], [ ("rolling", {"window": 2}), - ("rolling", {"window": "30s", "on": "C"}), + ("rolling", {"window": "30s"}), ("expanding", {}), ], ) @@ -304,12 +304,10 @@ def setup(self, method, window_kwargs): { "A": [str(i) for i in range(N)] * 10, "B": list(range(N)) * 10, - "C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10), } ) - if "on" in kwargs: - key = kwargs.pop("on") - df = df.set_index(key) + if isinstance(kwargs.get("window", None), str): + df.index = (pd.date_range(start="1900-01-01", freq="1min", periods=N * 10),) self.groupby_window = getattr(df.groupby("A"), window)(**kwargs) def time_method(self, method, window_kwargs): From b50f4405b0d58f8bfe630e3e4e19d055bca4cf2b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Jan 2023 13:45:54 -0800 Subject: [PATCH 6/6] fix asv again --- asv_bench/benchmarks/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 84e12199dc289..7e94763f3f293 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -307,7 +307,7 @@ def setup(self, method, window_kwargs): } ) if isinstance(kwargs.get("window", None), str): - df.index = (pd.date_range(start="1900-01-01", freq="1min", periods=N * 10),) + df.index = pd.date_range(start="1900-01-01", freq="1min", periods=N * 10) self.groupby_window = getattr(df.groupby("A"), window)(**kwargs) def time_method(self, method, window_kwargs):