From c504935cc3f85066caa0c98d8b0925d232071eec Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Sun, 18 Feb 2024 17:51:40 +0100
Subject: [PATCH 1/4] CoW: Track references in unstack if there is no copy

---
 pandas/core/reshape/reshape.py     | 17 ++++++++++++-----
 pandas/tests/reshape/test_pivot.py | 13 +++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index e8ef393da38d9..81dba1fb8f5db 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -229,20 +229,27 @@ def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
         return new_values, mask.any(0)
         # TODO: in all tests we have mask.any(0).all(); can we rely on that?
 
-    def get_result(self, values, value_columns, fill_value) -> DataFrame:
+    def get_result(self, obj, value_columns, fill_value) -> DataFrame:
+        values = obj._values
         if values.ndim == 1:
             values = values[:, np.newaxis]
 
         if value_columns is None and values.shape[1] != 1:  # pragma: no cover
             raise ValueError("must pass column labels for multi-column data")
 
-        values, _ = self.get_new_values(values, fill_value)
+        new_values, _ = self.get_new_values(values, fill_value)
         columns = self.get_new_columns(value_columns)
         index = self.new_index
 
-        return self.constructor(
-            values, index=index, columns=columns, dtype=values.dtype
+        result = self.constructor(
+            new_values, index=index, columns=columns, dtype=values.dtype, copy=False
         )
+        if values.base is new_values.base:
+            # We can only get here if one of the dimensions is size 1
+            mgr = result._mgr
+            mgr.blocks[0].refs = obj._mgr.blocks[0].refs
+            mgr.blocks[0].refs.add_reference(mgr.blocks[0])
+        return result
 
     def get_new_values(self, values, fill_value=None):
         if values.ndim == 1:
@@ -550,7 +557,7 @@ def _unstack_frame(
         return obj._constructor_from_mgr(mgr, axes=mgr.axes)
     else:
         return unstacker.get_result(
-            obj._values, value_columns=obj.columns, fill_value=fill_value
+            obj, value_columns=obj.columns, fill_value=fill_value
         )
 
 
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index d6b61bae850af..270b654149c09 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -2695,3 +2695,16 @@ def test_pivot_table_with_margins_and_numeric_column_names(self):
             index=Index(["a", "b", "All"], name=0),
         )
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("m", [1, 10])
+    def test_unstack_shares_memory(self, m):
+        # GH#56633
+        levels = np.arange(m)
+        index = MultiIndex.from_product([levels] * 2)
+        values = np.arange(m * m * 100).reshape(m * m, 100)
+        df = DataFrame(values, index, np.arange(100))
+        df_orig = df.copy()
+        result = df.unstack(sort=False)
+        assert np.shares_memory(df._values, result._values) is (m == 1)
+        result.iloc[0, 0] = -1
+        tm.assert_frame_equal(df, df_orig)

From 19dcda6e65ef020d893848e7b9d72e22b6f5f669 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Sun, 18 Feb 2024 17:59:07 +0100
Subject: [PATCH 2/4] Update

---
 pandas/core/reshape/reshape.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 81dba1fb8f5db..efb4fdb7dc230 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -33,6 +33,7 @@
     factorize,
     unique,
 )
+from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
 from pandas.core.arrays.categorical import factorize_from_iterable
 from pandas.core.construction import ensure_wrapped_if_datetimelike
 from pandas.core.frame import DataFrame
@@ -242,9 +243,15 @@ def get_result(self, obj, value_columns, fill_value) -> DataFrame:
         index = self.new_index
 
         result = self.constructor(
-            new_values, index=index, columns=columns, dtype=values.dtype, copy=False
+            new_values, index=index, columns=columns, dtype=new_values.dtype, copy=False
         )
-        if values.base is new_values.base:
+        if isinstance(values, np.ndarray):
+            base, new_base = values.base, new_values.base
+        elif isinstance(values, NDArrayBackedExtensionArray):
+            base, new_base = values._ndarray.base, new_values._ndarray.base
+        else:
+            base, new_base = 1, 2
+        if base is new_base:
             # We can only get here if one of the dimensions is size 1
             mgr = result._mgr
             mgr.blocks[0].refs = obj._mgr.blocks[0].refs
@@ -539,9 +546,7 @@ def unstack(
         unstacker = _Unstacker(
             obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
         )
-        return unstacker.get_result(
-            obj._values, value_columns=None, fill_value=fill_value
-        )
+        return unstacker.get_result(obj, value_columns=None, fill_value=fill_value)
 
 
 def _unstack_frame(

From 1572bdb67c1e5f2b6e3f368ad6640ea68b41a54c Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Sun, 18 Feb 2024 21:04:50 +0100
Subject: [PATCH 3/4] Update

---
 pandas/core/reshape/reshape.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index efb4fdb7dc230..822583f271c82 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -250,7 +250,7 @@ def get_result(self, obj, value_columns, fill_value) -> DataFrame:
         elif isinstance(values, NDArrayBackedExtensionArray):
             base, new_base = values._ndarray.base, new_values._ndarray.base
         else:
-            base, new_base = 1, 2
+            base, new_base = 1, 2  # type: ignore[assignment]
         if base is new_base:
             # We can only get here if one of the dimensions is size 1
             mgr = result._mgr

From a24201b5a4aba9bd9fed8e7125878819fc2ad778 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Mon, 19 Feb 2024 23:48:27 +0100
Subject: [PATCH 4/4] Update

---
 pandas/core/reshape/reshape.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 822583f271c82..0f9fa0a58ccb7 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -253,9 +253,7 @@ def get_result(self, obj, value_columns, fill_value) -> DataFrame:
             base, new_base = 1, 2  # type: ignore[assignment]
         if base is new_base:
             # We can only get here if one of the dimensions is size 1
-            mgr = result._mgr
-            mgr.blocks[0].refs = obj._mgr.blocks[0].refs
-            mgr.blocks[0].refs.add_reference(mgr.blocks[0])
+            result._mgr.add_references(obj._mgr)
         return result
 
     def get_new_values(self, values, fill_value=None):