From f0a23333423ae0ece6373843ed3ac4567c367886 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 19 Jun 2023 09:34:38 +0200 Subject: [PATCH 1/2] API / CoW: shallow copy of DataFrame/Series (.copy(deep=False)) also returns shallow copy of the index/columns --- pandas/core/internals/managers.py | 5 ++++- pandas/tests/copy_view/test_methods.py | 16 ++++++++++++++++ pandas/tests/frame/methods/test_reindex.py | 7 +++++-- pandas/tests/frame/test_npfuncs.py | 4 ++-- pandas/tests/reshape/concat/test_index.py | 12 ++++++------ pandas/tests/series/methods/test_align.py | 16 +++++++++------- pandas/tests/series/test_constructors.py | 7 +++++-- pandas/tests/test_multilevel.py | 12 +++++++++--- 8 files changed, 56 insertions(+), 23 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2a7c0536c66a4..181b1b7966fac 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -647,7 +647,10 @@ def copy_func(ax): new_axes = [copy_func(ax) for ax in self.axes] else: - new_axes = list(self.axes) + if using_copy_on_write(): + new_axes = [ax.view() for ax in self.axes] + else: + new_axes = list(self.axes) res = self.apply("copy", deep=deep) res.axes = new_axes diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 67fc91e0567ef..d630fe51ccc18 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -22,6 +22,12 @@ def test_copy(using_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_copy = df.copy() + # the deep copy by default takes a shallow copy of the Index + assert df_copy.index is not df.index + assert df_copy.columns is not df.columns + assert df_copy.index.is_(df.index) + assert df_copy.columns.is_(df.columns) + # the deep copy doesn't share memory assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: @@ -37,6 +43,16 @@ 
def test_copy_shallow(using_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_copy = df.copy(deep=False) + # the shallow copy also makes a shallow copy of the index + if using_copy_on_write: + assert df_copy.index is not df.index + assert df_copy.columns is not df.columns + assert df_copy.index.is_(df.index) + assert df_copy.columns.is_(df.columns) + else: + assert df_copy.index is df.index + assert df_copy.columns is df.columns + # the shallow copy still shares memory assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 63e2eb790a4ea..eb3e3e08a67d5 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -599,7 +599,7 @@ def test_reindex_sparse(self): ) tm.assert_frame_equal(result, expected) - def test_reindex(self, float_frame): + def test_reindex(self, float_frame, using_copy_on_write): datetime_series = tm.makeTimeSeries(nper=30) newFrame = float_frame.reindex(datetime_series.index) @@ -639,7 +639,10 @@ def test_reindex(self, float_frame): # Same index, copies values but not index if copy=False newFrame = float_frame.reindex(float_frame.index, copy=False) - assert newFrame.index is float_frame.index + if using_copy_on_write: + assert newFrame.index.is_(float_frame.index) + else: + assert newFrame.index is float_frame.index # length zero newFrame = float_frame.reindex([]) diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py index b734dafb6c31b..1159623bdd281 100644 --- a/pandas/tests/frame/test_npfuncs.py +++ b/pandas/tests/frame/test_npfuncs.py @@ -22,8 +22,8 @@ def test_np_sqrt(self, float_frame): with np.errstate(all="ignore"): result = np.sqrt(float_frame) assert isinstance(result, type(float_frame)) - assert result.index is float_frame.index - assert result.columns is float_frame.columns + 
assert result.index.is_(float_frame.index) + assert result.columns.is_(float_frame.columns) tm.assert_frame_equal(result, float_frame.apply(np.sqrt)) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index ce06e74de91b9..b4f2a5e13aac4 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -114,14 +114,14 @@ def test_concat_copy_index_frame(self, axis, using_copy_on_write): df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) comb = concat([df, df], axis=axis, copy=True) if not using_copy_on_write: - assert comb.index is not df.index - assert comb.columns is not df.columns + assert not comb.index.is_(df.index) + assert not comb.columns.is_(df.columns) elif axis in [0, "index"]: - assert comb.index is not df.index - assert comb.columns is df.columns + assert not comb.index.is_(df.index) + assert comb.columns.is_(df.columns) elif axis in [1, "columns"]: - assert comb.index is df.index - assert comb.columns is not df.columns + assert comb.index.is_(df.index) + assert not comb.columns.is_(df.columns) def test_default_index(self): # is_series and ignore_index diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index 3edbe1b2f61f3..ceb4d409ca2e6 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -127,16 +127,18 @@ def test_align_nocopy(datetime_series, using_copy_on_write): def test_align_same_index(datetime_series, using_copy_on_write): a, b = datetime_series.align(datetime_series, copy=False) - assert a.index is datetime_series.index - assert b.index is datetime_series.index - - a, b = datetime_series.align(datetime_series, copy=True) if not using_copy_on_write: - assert a.index is not datetime_series.index - assert b.index is not datetime_series.index - else: assert a.index is datetime_series.index assert b.index is datetime_series.index + else: + assert 
a.index.is_(datetime_series.index) + assert b.index.is_(datetime_series.index) + + a, b = datetime_series.align(datetime_series, copy=True) + assert a.index is not datetime_series.index + assert b.index is not datetime_series.index + assert a.index.is_(datetime_series.index) + assert b.index.is_(datetime_series.index) def test_align_multiindex(): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 4bf16b6d20d1f..99f772ead4262 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -628,12 +628,15 @@ def test_constructor_maskedarray_hardened(self): expected = Series([np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) - def test_series_ctor_plus_datetimeindex(self): + def test_series_ctor_plus_datetimeindex(self, using_copy_on_write): rng = date_range("20090415", "20090519", freq="B") data = {k: 1 for k in rng} result = Series(data, index=rng) - assert result.index is rng + if using_copy_on_write: + assert result.index.is_(rng) + else: + assert result.index is rng def test_constructor_default_index(self): s = Series([0, 1, 2]) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8c5f9a894f2f7..b97fa2f9bc16f 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -46,20 +46,26 @@ def test_reindex(self, multiindex_dataframe_random_data): tm.assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write ): ymd = multiindex_year_month_day_dataframe_random_data new_index = ymd.index[::10] chunk = ymd.reindex(new_index) - assert chunk.index is new_index + if using_copy_on_write: + assert chunk.index.is_(new_index) + else: + assert chunk.index is new_index chunk = ymd.loc[new_index] assert chunk.index.equals(new_index) ymdT = ymd.T chunk = 
ymdT.reindex(columns=new_index) - assert chunk.columns is new_index + if using_copy_on_write: + assert chunk.columns.is_(new_index) + else: + assert chunk.columns is new_index chunk = ymdT.loc[:, new_index] assert chunk.columns.equals(new_index) From da29e453081f88944db1109acd30be734a6d3f42 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 4 Jul 2023 11:26:32 +0200 Subject: [PATCH 2/2] add whatsnew --- doc/source/whatsnew/v2.1.0.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6390fbeed8548..0774143c4424e 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -25,6 +25,10 @@ Copy-on-Write improvements - The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary of Index objects and specifying ``copy=False``, will now use a lazy copy of those Index objects for the columns of the DataFrame (:issue:`52947`) +- A shallow copy of a Series or DataFrame (``df.copy(deep=False)``) will now also return + a shallow copy of the rows/columns ``Index`` objects instead of only a shallow copy of + the data, i.e. the index of the result is no longer identical + (``df.copy(deep=False).index is df.index`` is no longer True) (:issue:`53721`) - Add lazy copy mechanism to :meth:`DataFrame.eval` (:issue:`53746`) .. _whatsnew_210.enhancements.enhancement2: