DEPR: Remove silent dropping of nuisance columns in window ops (#50576)

mroeschke · web-flow · commit 548359043697 · 2023-01-05T16:39:10.000-08:00
* DEPR: Remove silent dropping of nuisance columns in window ops

* Align exception with series and fix test

* Fix message

* Fix asv

* fix asv again

* fix asv again
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
@@ -292,7 +292,7 @@ class Groupby:
         ["sum", "median", "mean", "max", "min", "kurt", "sum"],
         [
             ("rolling", {"window": 2}),
-            ("rolling", {"window": "30s", "on": "C"}),
+            ("rolling", {"window": "30s"}),
             ("expanding", {}),
         ],
     )
@@ -304,9 +304,10 @@ def setup(self, method, window_kwargs):
             {
                 "A": [str(i) for i in range(N)] * 10,
                 "B": list(range(N)) * 10,
-                "C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10),
             }
         )
+        if isinstance(kwargs.get("window", None), str):
+            df.index = pd.date_range(start="1900-01-01", freq="1min", periods=N * 10)
         self.groupby_window = getattr(df.groupby("A"), window)(**kwargs)
 
     def time_method(self, method, window_kwargs):
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -729,6 +729,7 @@ Removal of prior version deprecations/changes
 - Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
 - Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
 - Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
+- Enforced deprecation of silently dropping nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` operations. This will now raise a :class:`.errors.DataError` (:issue:`42834`)
 - Changed behavior in setting values with ``df.loc[:, foo] = bar`` or ``df.iloc[:, foo] = bar``, these now always attempt to set values inplace before falling back to casting (:issue:`45333`)
 - Changed default of ``numeric_only`` in various :class:`.DataFrameGroupBy` methods; all methods now default to ``numeric_only=False`` (:issue:`46072`)
 - Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`)
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -18,7 +18,6 @@
     Sized,
     cast,
 )
-import warnings
 
 import numpy as np
 
@@ -37,7 +36,6 @@
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import DataError
 from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.common import (
     ensure_float64,
@@ -473,24 +471,23 @@ def _apply_blockwise(
             obj = notna(obj).astype(int)
             obj._mgr = obj._mgr.consolidate()
 
-        def hfunc(values: ArrayLike) -> ArrayLike:
-            values = self._prep_values(values)
-            return homogeneous_func(values)
-
         if self.axis == 1:
             obj = obj.T
 
         taker = []
         res_values = []
         for i, arr in enumerate(obj._iter_column_arrays()):
             # GH#42736 operate column-wise instead of block-wise
+            # As of 2.0, nuisance columns raise DataError rather than being dropped
             try:
-                res = hfunc(arr)
-            except (TypeError, NotImplementedError):
-                pass
-            else:
-                res_values.append(res)
-                taker.append(i)
+                arr = self._prep_values(arr)
+            except (TypeError, NotImplementedError) as err:
+                raise DataError(
+                    f"Cannot aggregate non-numeric type: {arr.dtype}"
+                ) from err
+            res = homogeneous_func(arr)
+            res_values.append(res)
+            taker.append(i)
 
         index = self._slice_axis_for_step(
             obj.index, res_values[0] if len(res_values) > 0 else None
@@ -505,18 +502,6 @@ def hfunc(values: ArrayLike) -> ArrayLike:
         if self.axis == 1:
             df = df.T
 
-        if 0 != len(res_values) != len(obj.columns):
-            # GH#42738 ignore_failures dropped nuisance columns
-            dropped = obj.columns.difference(obj.columns.take(taker))
-            warnings.warn(
-                "Dropping of nuisance columns in rolling operations "
-                "is deprecated; in a future version this will raise TypeError. "
-                "Select only valid columns before calling the operation. "
-                f"Dropped columns were {dropped}",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-
         return self._resolve_output(df, obj)
 
     def _apply_tablewise(
diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py
@@ -1,7 +1,10 @@
 import numpy as np
 import pytest
 
-from pandas.errors import SpecificationError
+from pandas.errors import (
+    DataError,
+    SpecificationError,
+)
 
 from pandas import (
     DataFrame,
@@ -66,18 +69,12 @@ def tests_skip_nuisance(step):
     tm.assert_frame_equal(result, expected)
 
 
-def test_skip_sum_object_raises(step):
+def test_sum_object_str_raises(step):
     df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"})
     r = df.rolling(window=3, step=step)
-    msg = r"nuisance columns.*Dropped columns were Index\(\['C'\], dtype='object'\)"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        # GH#42738
-        result = r.sum()
-    expected = DataFrame(
-        {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]},
-        columns=list("AB"),
-    )[::step]
-    tm.assert_frame_equal(result, expected)
+    with pytest.raises(DataError, match="Cannot aggregate non-numeric type: object"):
+        # GH#42738, enforced in 2.0
+        r.sum()
 
 
 def test_agg(step):
diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py
@@ -165,7 +165,7 @@ def test_dataframe_dtypes(method, expected_data, dtypes, min_periods, step):
     rolled = df.rolling(2, min_periods=min_periods, step=step)
 
     if dtypes in ("m8[ns]", "M8[ns]", "datetime64[ns, UTC]") and method != "count":
-        msg = "No numeric types to aggregate"
+        msg = "Cannot aggregate non-numeric type"
         with pytest.raises(DataError, match=msg):
             getattr(rolled, method)()
     else:
diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py
@@ -98,11 +98,9 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods):
     halflife = halflife_with_times
     data = np.arange(10.0)
     data[::2] = np.nan
-    df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)})
-    with tm.assert_produces_warning(FutureWarning, match="nuisance columns"):
-        # GH#42738
-        result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean()
-        expected = df.ewm(halflife=1.0, min_periods=min_periods).mean()
+    df = DataFrame({"A": data})
+    result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean()
+    expected = df.ewm(halflife=1.0, min_periods=min_periods).mean()
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
@@ -1125,13 +1125,6 @@ def test_methods(self, method, expected_data):
         )
         tm.assert_frame_equal(result, expected)
 
-        with tm.assert_produces_warning(FutureWarning, match="nuisance"):
-            # GH#42738
-            expected = df.groupby("A", group_keys=True).apply(
-                lambda x: getattr(x.ewm(com=1.0), method)()
-            )
-        tm.assert_frame_equal(result, expected)
-
     @pytest.mark.parametrize(
         "method, expected_data",
         [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]],
@@ -1160,13 +1153,9 @@ def test_pairwise_methods(self, method, expected_data):
     def test_times(self, times_frame):
         # GH 40951
         halflife = "23 days"
-        with tm.assert_produces_warning(FutureWarning, match="nuisance"):
-            # GH#42738
-            result = (
-                times_frame.groupby("A")
-                .ewm(halflife=halflife, times=times_frame["C"])
-                .mean()
-            )
+        # GH#42738
+        times = times_frame.pop("C")
+        result = times_frame.groupby("A").ewm(halflife=halflife, times=times).mean()
         expected = DataFrame(
             {
                 "B": [
@@ -1200,29 +1189,13 @@ def test_times(self, times_frame):
         )
         tm.assert_frame_equal(result, expected)
 
-    def test_times_vs_apply(self, times_frame):
-        # GH 40951
-        halflife = "23 days"
-        with tm.assert_produces_warning(FutureWarning, match="nuisance"):
-            # GH#42738
-            result = (
-                times_frame.groupby("A")
-                .ewm(halflife=halflife, times=times_frame["C"])
-                .mean()
-            )
-            expected = times_frame.groupby("A", group_keys=True).apply(
-                lambda x: x.ewm(halflife=halflife, times=x["C"]).mean()
-            )
-        tm.assert_frame_equal(result, expected)
-
     def test_times_array(self, times_frame):
         # GH 40951
         halflife = "23 days"
+        times = times_frame.pop("C")
         gb = times_frame.groupby("A")
-        with tm.assert_produces_warning(FutureWarning, match="nuisance"):
-            # GH#42738
-            result = gb.ewm(halflife=halflife, times=times_frame["C"]).mean()
-            expected = gb.ewm(halflife=halflife, times=times_frame["C"].values).mean()
+        result = gb.ewm(halflife=halflife, times=times).mean()
+        expected = gb.ewm(halflife=halflife, times=times.values).mean()
         tm.assert_frame_equal(result, expected)
 
     def test_dont_mutate_obj_after_slicing(self):
diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
@@ -253,35 +253,32 @@ def test_invalid_engine_kwargs(self, grouper, method):
     def test_cython_vs_numba(
         self, grouper, method, nogil, parallel, nopython, ignore_na, adjust
     ):
+        df = DataFrame({"B": range(4)})
         if grouper == "None":
             grouper = lambda x: x
-            warn = FutureWarning
         else:
+            df["A"] = ["a", "b", "a", "b"]
             grouper = lambda x: x.groupby("A")
-            warn = None
         if method == "sum":
             adjust = True
-        df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
         ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)
 
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
-        with tm.assert_produces_warning(warn, match="nuisance"):
-            # GH#42738
-            result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs)
-            expected = getattr(ewm, method)(engine="cython")
+        result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs)
+        expected = getattr(ewm, method)(engine="cython")
 
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("grouper", ["None", "groupby"])
     def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na):
         # GH 40951
 
+        df = DataFrame({"B": [0, 0, 1, 1, 2, 2]})
         if grouper == "None":
             grouper = lambda x: x
-            warn = FutureWarning
         else:
             grouper = lambda x: x.groupby("A")
-            warn = None
+            df["A"] = ["a", "b", "a", "b", "b", "a"]
 
         halflife = "23 days"
         times = to_datetime(
@@ -294,17 +291,14 @@ def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_
                 "2020-01-03",
             ]
         )
-        df = DataFrame({"A": ["a", "b", "a", "b", "b", "a"], "B": [0, 0, 1, 1, 2, 2]})
         ewm = grouper(df).ewm(
             halflife=halflife, adjust=True, ignore_na=ignore_na, times=times
         )
 
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
 
-        with tm.assert_produces_warning(warn, match="nuisance"):
-            # GH#42738
-            result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
-            expected = ewm.mean(engine="cython")
+        result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
+        expected = ewm.mean(engine="cython")
 
         tm.assert_frame_equal(result, expected)
 

Original file line number	Diff line number	Diff line change
`@@ -292,7 +292,7 @@ class Groupby:`
`292`	`292`	`["sum", "median", "mean", "max", "min", "kurt", "sum"],`
`293`	`293`	`[`
`294`	`294`	`("rolling", {"window": 2}),`
`295`		`- ("rolling", {"window": "30s", "on": "C"}),`
	`295`	`+ ("rolling", {"window": "30s"}),`
`296`	`296`	`("expanding", {}),`
`297`	`297`	`],`
`298`	`298`	`)`
`@@ -304,9 +304,10 @@ def setup(self, method, window_kwargs):`
`304`	`304`	`{`
`305`	`305`	`"A": [str(i) for i in range(N)] * 10,`
`306`	`306`	`"B": list(range(N)) * 10,`
`307`		`- "C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10),`
`308`	`307`	`}`
`309`	`308`	`)`
	`309`	`+ if isinstance(kwargs.get("window", None), str):`
	`310`	`+ df.index = pd.date_range(start="1900-01-01", freq="1min", periods=N * 10)`
`310`	`311`	`self.groupby_window = getattr(df.groupby("A"), window)(**kwargs)`
`311`	`312`
`312`	`313`	`def time_method(self, method, window_kwargs):`