From ed83c88b373c80b62f7c667b9853dbdfa650c8f5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Mar 2023 20:12:13 +0000 Subject: [PATCH 1/2] BUG: CoW not tracking references when indexing midx with slice --- doc/source/whatsnew/v2.0.0.rst | 3 +++ pandas/core/frame.py | 5 ++++- pandas/tests/copy_view/test_indexing.py | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index a8277a4336a2b..b308c1c70f1e5 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -206,6 +206,9 @@ Copy-on-Write improvements - Arithmetic operations that can be inplace, e.g. ``ser *= 2`` will now respect the Copy-on-Write mechanism. +- :meth:`DataFrame.__getitem__` will now respect the Copy-on-Write mechanism when the + :class:`DataFrame` has :class:`MultiIndex` columns. + Copy-on-Write can be enabled through one of .. code-block:: python diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 70019030da182..bd50b13d1e95d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3832,10 +3832,13 @@ def _getitem_multilevel(self, key): result = self.reindex(columns=new_columns) result.columns = result_columns else: - new_values = self.values[:, loc] + new_values = self._values[:, loc] result = self._constructor( new_values, index=self.index, columns=result_columns ) + if using_copy_on_write() and isinstance(loc, slice): + result._mgr.add_references(self._mgr) + result = result.__finalize__(self) # If there is only one column being returned, and its name is diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index ca2954f0d390e..bb7296fa3cb5f 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1034,3 +1034,17 @@ def test_set_value_copy_only_necessary_column( assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) else: assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) + + +def test_getitem_midx_slice(using_copy_on_write): + df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2}) + df_orig = df.copy() + new_df = df[("a",)] + + if using_copy_on_write: + assert not new_df._mgr._has_no_reference(0) + + assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x")) + if using_copy_on_write: + new_df.iloc[0, 0] = 100 + tm.assert_frame_equal(df_orig, df) From 6f58ad72553935d3d313d4128ac897fa7b9169e3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 12:02:58 +0100 Subject: [PATCH 2/2] Fix mypy and array manager test --- pandas/core/frame.py | 2 +- pandas/tests/copy_view/test_indexing.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 53705dd45f90d..650df0d18e54d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3837,7 +3837,7 @@ def _getitem_multilevel(self, key): new_values, index=self.index, columns=result_columns ) if using_copy_on_write() and isinstance(loc, slice): - result._mgr.add_references(self._mgr) + result._mgr.add_references(self._mgr) # type: ignore[arg-type] result = result.__finalize__(self) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index bb7296fa3cb5f..17b9c16ce1e15 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1036,7 +1036,7 @@ def test_set_value_copy_only_necessary_column( assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) -def test_getitem_midx_slice(using_copy_on_write): +def test_getitem_midx_slice(using_copy_on_write, using_array_manager): df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2}) df_orig = df.copy() new_df = df[("a",)] @@ -1044,7 +1044,8 @@ def test_getitem_midx_slice(using_copy_on_write): if using_copy_on_write: assert not new_df._mgr._has_no_reference(0) - assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x")) + if not using_array_manager: + assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x")) if using_copy_on_write: new_df.iloc[0, 0] = 100 tm.assert_frame_equal(df_orig, df)