From 4f7c49eee3482491b8a042d26ce75f34e9a0622b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 27 Aug 2021 12:29:20 -0700 Subject: [PATCH 1/5] BUG: rolling.corr with MultiIndex columns --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/window/common.py | 14 ++++++++++++-- pandas/tests/window/test_rolling.py | 16 ++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index fc488504f1fdf..a33467483676d 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -352,7 +352,7 @@ Groupby/resample/rolling - Bug in :meth:`pandas.DataFrame.ewm`, where non-float64 dtypes were silently failing (:issue:`42452`) - Bug in :meth:`pandas.DataFrame.rolling` operation along rows (``axis=1``) incorrectly omits columns containing ``float16`` and ``float32`` (:issue:`41779`) - Bug in :meth:`Resampler.aggregate` did not allow the use of Named Aggregation (:issue:`32803`) -- +- Bug in :meth:`DataFrame.rolling.corr` when the :class:`DataFrame` columns was a :class:`MultiIndex` (:issue:`21157`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index e0720c5d86df1..3f36f73c5ac37 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -83,8 +83,18 @@ def dataframe_from_int_dict(data, frame_template): # mypy needs to know columns is a MultiIndex, Index doesn't # have levels attribute arg2.columns = cast(MultiIndex, arg2.columns) - result.index = MultiIndex.from_product( - arg2.columns.levels + [result_index] + # GH 21157 + idx_codes, idx_uniques = result_index.factorize() + result_levels = list(arg2.columns.levels) + [idx_uniques] + result_codes = [ + np.tile(code, int(len(result) / len(code))) + for code in arg2.columns.codes + ] + [np.tile(idx_codes, int(len(result) / len(idx_codes)))] + result_names = list(arg2.columns.names) + [result_index.name] + result.index = MultiIndex( + 
levels=result_levels, + codes=result_codes, + names=result_names, ) # GH 34440 num_levels = len(result.index.levels) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2edf22d96a9ba..2fb6b61d46b83 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1500,3 +1500,19 @@ def test_rolling_numeric_dtypes(): dtype="float64", ) tm.assert_frame_equal(result, expected) + + +def test_multindex_columns_pairwise_func(): + # GH 21157 + columns = MultiIndex.from_arrays([["M", "N"], ["P", "Q"]], names=["a", "b"]) + df = DataFrame(np.ones((5, 2)), columns=columns) + result = df.rolling(3).corr() + expected = DataFrame( + np.nan, + index=MultiIndex.from_arrays( + [np.repeat(np.arange(5), 2), ["M", "N"] * 5, ["P", "Q"] * 5], + names=[None, "a", "b"], + ), + columns=columns, + ) + tm.assert_frame_equal(result, expected) From 027907e35279ed7c929e0e951c77c5a11c59854f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 31 Aug 2021 22:18:38 -0700 Subject: [PATCH 2/5] Add commentary of fix --- pandas/core/window/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 3f36f73c5ac37..adf910d9ee2c0 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -83,7 +83,11 @@ def dataframe_from_int_dict(data, frame_template): # mypy needs to know columns is a MultiIndex, Index doesn't # have levels attribute arg2.columns = cast(MultiIndex, arg2.columns) - # GH 21157 + # GH 21157: Equivalent to MultiIndex.from_product( + # <unique combinations of arg2.columns.levels>, [result_index] + # ) + # A normal MultiIndex.from_product will produce too many + # combinations.
idx_codes, idx_uniques = result_index.factorize() result_levels = list(arg2.columns.levels) + [idx_uniques] result_codes = [ From 474ef078fcf097cc488a0a1e4a711c3df49b0c83 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 2 Sep 2021 17:52:23 -0700 Subject: [PATCH 3/5] Trigger CI From f9fe6806f3404ad03d3098561ae2cc11b62cc54d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 6 Sep 2021 14:35:09 -0700 Subject: [PATCH 4/5] Use from_arrays --- pandas/core/window/common.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index adf910d9ee2c0..7e80fc258ce09 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -84,26 +84,24 @@ def dataframe_from_int_dict(data, frame_template): # have levels attribute arg2.columns = cast(MultiIndex, arg2.columns) # GH 21157: Equivalent to MultiIndex.from_product( - # <unique combinations of arg2.columns.levels>, [result_index] + # [result_index], <unique combinations of arg2.columns.levels>, # ) # A normal MultiIndex.from_product will produce too many # combinations.
- idx_codes, idx_uniques = result_index.factorize() - result_levels = list(arg2.columns.levels) + [idx_uniques] - result_codes = [ - np.tile(code, int(len(result) / len(code))) - for code in arg2.columns.codes - ] + [np.tile(idx_codes, int(len(result) / len(idx_codes)))] - result_names = list(arg2.columns.names) + [result_index.name] - result.index = MultiIndex( - levels=result_levels, - codes=result_codes, - names=result_names, + result_level = np.repeat( + result_index, len(result) // len(result_index) + ) + arg2_levels = ( + np.tile( + arg2.columns.get_level_values(i), + len(result) // len(arg2.columns), + ) + for i in range(arg2.columns.nlevels) + ) + result_names = [result_index.name] + list(arg2.columns.names) + result.index = MultiIndex.from_arrays( + [result_level, *arg2_levels], names=result_names ) - # GH 34440 - num_levels = len(result.index.levels) - new_order = [num_levels - 1] + list(range(num_levels - 1)) - result = result.reorder_levels(new_order).sort_index() else: result.index = MultiIndex.from_product( [range(len(arg2.columns)), range(len(result_index))] From 051d0c66eed387e51bfa94aecaf49777f670461e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 6 Sep 2021 16:40:09 -0700 Subject: [PATCH 5/5] Consider reordering --- pandas/core/window/common.py | 12 ++++++++---- pandas/tests/window/test_pairwise.py | 15 +++++++++++++++ pandas/tests/window/test_rolling.py | 16 ---------------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 7e80fc258ce09..15144116fa924 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -88,20 +88,24 @@ def dataframe_from_int_dict(data, frame_template): # ) # A normal MultiIndex.from_product will produce too many # combinations. 
- result_level = np.repeat( + result_level = np.tile( result_index, len(result) // len(result_index) ) arg2_levels = ( - np.tile( + np.repeat( arg2.columns.get_level_values(i), len(result) // len(arg2.columns), ) for i in range(arg2.columns.nlevels) ) - result_names = [result_index.name] + list(arg2.columns.names) + result_names = list(arg2.columns.names) + [result_index.name] result.index = MultiIndex.from_arrays( - [result_level, *arg2_levels], names=result_names + [*arg2_levels, result_level], names=result_names ) + # GH 34440 + num_levels = len(result.index.levels) + new_order = [num_levels - 1] + list(range(num_levels - 1)) + result = result.reorder_levels(new_order).sort_index() else: result.index = MultiIndex.from_product( [range(len(arg2.columns)), range(len(result_index))] diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index a0d24a061fc4a..f43d7ec99e312 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -222,3 +222,18 @@ def test_cov_mulittindex(self): ) tm.assert_frame_equal(result, expected) + + def test_multindex_columns_pairwise_func(self): + # GH 21157 + columns = MultiIndex.from_arrays([["M", "N"], ["P", "Q"]], names=["a", "b"]) + df = DataFrame(np.ones((5, 2)), columns=columns) + result = df.rolling(3).corr() + expected = DataFrame( + np.nan, + index=MultiIndex.from_arrays( + [np.repeat(np.arange(5), 2), ["M", "N"] * 5, ["P", "Q"] * 5], + names=[None, "a", "b"], + ), + columns=columns, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2fb6b61d46b83..2edf22d96a9ba 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1500,19 +1500,3 @@ def test_rolling_numeric_dtypes(): dtype="float64", ) tm.assert_frame_equal(result, expected) - - -def test_multindex_columns_pairwise_func(): - # GH 21157 - columns = MultiIndex.from_arrays([["M", "N"], 
["P", "Q"]], names=["a", "b"]) - df = DataFrame(np.ones((5, 2)), columns=columns) - result = df.rolling(3).corr() - expected = DataFrame( - np.nan, - index=MultiIndex.from_arrays( - [np.repeat(np.arange(5), 2), ["M", "N"] * 5, ["P", "Q"] * 5], - names=[None, "a", "b"], - ), - columns=columns, - ) - tm.assert_frame_equal(result, expected)