From 2143ad396e005400060c50630f651f263b9e9341 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 2 Oct 2021 10:15:24 -0400 Subject: [PATCH 1/2] REGR: RollingGroupby.corr() producing incorrect results --- doc/source/whatsnew/v1.3.4.rst | 2 ++ pandas/core/window/rolling.py | 5 ++-- pandas/tests/window/test_groupby.py | 36 +++++++++++++++++++++++++++++ pandas/tests/window/test_rolling.py | 10 ++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 05667264ad9af..6f07dc3e1e2f9 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -24,6 +24,8 @@ Fixed regressions - Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`) - Fixed regression in :meth:`DataFrame.explode` raising ``AssertionError`` when ``column`` is any scalar which is not a string (:issue:`43314`) - Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`) +- Fixed regression when iterating over a :class:`DataFrame.groupby.rolling` object causing the resulting DataFrames to have an incorrect index if the input groupings were not sorted (:issue:`43386`) +- Fixed regression in :meth:`DataFrame.groupby.rolling.cov` and :meth:`DataFrame.groupby.rolling.corr` computing incorrect results if the input groupings were not sorted (:issue:`43386`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ea40e8d816f45..2b8ed3c97d026 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -301,8 +301,8 @@ def __repr__(self) -> str: return f"{type(self).__name__} [{attrs}]" def __iter__(self): - obj = self._create_data(self._selected_obj) - obj = obj.set_axis(self._on) + obj = self._selected_obj.set_axis(self._on) + obj = self._create_data(obj) indexer = self._get_window_indexer() start, end = indexer.get_window_bounds( @@ -727,6 +727,7 @@ def _apply_pairwise( """ # Manually drop the grouping column first target = target.drop(columns=self._grouper.names, errors="ignore") + target = self._create_data(target) result = super()._apply_pairwise(target, other, pairwise, func) # 1) Determine the levels + codes of the groupby levels if other is not None: diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 30f27db6dc2d2..2077d2a210765 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -146,6 +146,42 @@ def func(x): expected = g.apply(func) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "func, expected_values", + [("cov", [[1.0, 1.0], [1.0, 4.0]]), ("corr", [[1.0, 0.5], [0.5, 1.0]])], + ) + def test_rolling_corr_cov_unordered(self, func, expected_values): + # GH 43386 + df = DataFrame( + { + "a": ["g1", "g2", "g1", "g1"], + "b": [0, 0, 1, 2], + "c": [2, 0, 6, 4], + } + ) + rol = df.groupby("a").rolling(3) + result = getattr(rol, func)() + expected = DataFrame( + { + "b": 4 * [np.nan] + expected_values[0] + 2 * [np.nan], + "c": 4 * [np.nan] + expected_values[1] + 2 * [np.nan], + }, + index=MultiIndex.from_tuples( + [ + ("g1", 0, "b"), + ("g1", 0, "c"), + ("g1", 2, "b"), + ("g1", 2, "c"), + ("g1", 3, "b"), + ("g1", 3, "c"), + ("g2", 1, "b"), + ("g2", 1, "c"), + ], + names=["a", None, None], + ), + ) + tm.assert_frame_equal(result, expected) + def test_rolling_apply(self, raw): g = self.frame.groupby("A") r = g.rolling(window=4) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index ed1039223e831..f908b680cab0d 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -792,6 +792,16 @@ def test_iter_rolling_on_dataframe(expected, window): tm.assert_frame_equal(actual, expected) +def test_iter_rolling_on_dataframe_unordered(): + # GH 43386 + df = DataFrame({"a": ["x", "y", "x"], "b": [0, 1, 2]}) + results = list(df.groupby("a").rolling(2)) + indexes = [[0], [0, 2], [1]] + expecteds = [df.iloc[idx, [1]] for idx in indexes] + for result, expected in zip(results, expecteds): + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "ser,expected,window, min_periods", [ From 7fc79553d05cc472ed6ee9bbaa38139552ecdcba Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 2 Oct 2021 12:49:21 -0400 Subject: [PATCH 2/2] test cleanup --- pandas/tests/window/test_rolling.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f908b680cab0d..1631c9f0e2ffd 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -796,8 +796,7 @@ def test_iter_rolling_on_dataframe_unordered(): # GH 43386 df = DataFrame({"a": ["x", "y", "x"], "b": [0, 1, 2]}) results = list(df.groupby("a").rolling(2)) - indexes = [[0], [0, 2], [1]] - expecteds = [df.iloc[idx, [1]] for idx in indexes] + expecteds = [df.iloc[idx, [1]] for idx in [[0], [0, 2], [1]]] for result, expected in zip(results, expecteds): tm.assert_frame_equal(result, expected)