Skip to content

API / CoW: shallow copy of DataFrame/Series (.copy(deep=False)) also returns shallow copy of the index/columns #53722

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ Copy-on-Write improvements
- The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary
of Index objects and specifying ``copy=False``, will now use a lazy copy
of those Index objects for the columns of the DataFrame (:issue:`52947`)
- A shallow copy of a Series or DataFrame (``df.copy(deep=False)``) will now also return
a shallow copy of the rows/columns ``Index`` objects instead of only a shallow copy of
the data, i.e. the index of the result is no longer identical to the original's index
(``df.copy(deep=False).index is df.index`` is no longer True) (:issue:`53721`)
- :meth:`DataFrame.head` and :meth:`DataFrame.tail` will now return deep copies (:issue:`54011`)
- Add lazy copy mechanism to :meth:`DataFrame.eval` (:issue:`53746`)

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,10 @@ def copy_func(ax):

new_axes = [copy_func(ax) for ax in self.axes]
else:
new_axes = list(self.axes)
if using_copy_on_write():
new_axes = [ax.view() for ax in self.axes]
else:
new_axes = list(self.axes)

res = self.apply("copy", deep=deep)
res.axes = new_axes
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ def test_copy(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_copy = df.copy()

# the deep copy by default takes a shallow copy of the Index
assert df_copy.index is not df.index
assert df_copy.columns is not df.columns
assert df_copy.index.is_(df.index)
assert df_copy.columns.is_(df.columns)

# the deep copy doesn't share memory
assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))
if using_copy_on_write:
Expand All @@ -37,6 +43,16 @@ def test_copy_shallow(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_copy = df.copy(deep=False)

# the shallow copy also makes a shallow copy of the index
if using_copy_on_write:
assert df_copy.index is not df.index
assert df_copy.columns is not df.columns
assert df_copy.index.is_(df.index)
assert df_copy.columns.is_(df.columns)
else:
assert df_copy.index is df.index
assert df_copy.columns is df.columns

# the shallow copy still shares memory
assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))
if using_copy_on_write:
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/frame/methods/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,7 +607,7 @@ def test_reindex_sparse(self):
)
tm.assert_frame_equal(result, expected)

def test_reindex(self, float_frame):
def test_reindex(self, float_frame, using_copy_on_write):
datetime_series = tm.makeTimeSeries(nper=30)

newFrame = float_frame.reindex(datetime_series.index)
Expand Down Expand Up @@ -647,7 +647,10 @@ def test_reindex(self, float_frame):

# Same index, copies values but not index if copy=False
newFrame = float_frame.reindex(float_frame.index, copy=False)
assert newFrame.index is float_frame.index
if using_copy_on_write:
assert newFrame.index.is_(float_frame.index)
else:
assert newFrame.index is float_frame.index

# length zero
newFrame = float_frame.reindex([])
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/test_npfuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def test_np_sqrt(self, float_frame):
with np.errstate(all="ignore"):
result = np.sqrt(float_frame)
assert isinstance(result, type(float_frame))
assert result.index is float_frame.index
assert result.columns is float_frame.columns
assert result.index.is_(float_frame.index)
assert result.columns.is_(float_frame.columns)

tm.assert_frame_equal(result, float_frame.apply(np.sqrt))

Expand Down
12 changes: 6 additions & 6 deletions pandas/tests/reshape/concat/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@ def test_concat_copy_index_frame(self, axis, using_copy_on_write):
df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
comb = concat([df, df], axis=axis, copy=True)
if not using_copy_on_write:
assert comb.index is not df.index
assert comb.columns is not df.columns
assert not comb.index.is_(df.index)
assert not comb.columns.is_(df.columns)
elif axis in [0, "index"]:
assert comb.index is not df.index
assert comb.columns is df.columns
assert not comb.index.is_(df.index)
assert comb.columns.is_(df.columns)
elif axis in [1, "columns"]:
assert comb.index is df.index
assert comb.columns is not df.columns
assert comb.index.is_(df.index)
assert not comb.columns.is_(df.columns)

def test_default_index(self):
# is_series and ignore_index
Expand Down
16 changes: 9 additions & 7 deletions pandas/tests/series/methods/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,16 +127,18 @@ def test_align_nocopy(datetime_series, using_copy_on_write):

def test_align_same_index(datetime_series, using_copy_on_write):
a, b = datetime_series.align(datetime_series, copy=False)
assert a.index is datetime_series.index
assert b.index is datetime_series.index

a, b = datetime_series.align(datetime_series, copy=True)
if not using_copy_on_write:
assert a.index is not datetime_series.index
assert b.index is not datetime_series.index
else:
assert a.index is datetime_series.index
assert b.index is datetime_series.index
else:
assert a.index.is_(datetime_series.index)
assert b.index.is_(datetime_series.index)

a, b = datetime_series.align(datetime_series, copy=True)
assert a.index is not datetime_series.index
assert b.index is not datetime_series.index
assert a.index.is_(datetime_series.index)
assert b.index.is_(datetime_series.index)


def test_align_multiindex():
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,12 +628,15 @@ def test_constructor_maskedarray_hardened(self):
expected = Series([np.nan, np.nan, np.nan])
tm.assert_series_equal(result, expected)

def test_series_ctor_plus_datetimeindex(self):
def test_series_ctor_plus_datetimeindex(self, using_copy_on_write):
rng = date_range("20090415", "20090519", freq="B")
data = {k: 1 for k in rng}

result = Series(data, index=rng)
assert result.index is rng
if using_copy_on_write:
assert result.index.is_(rng)
else:
assert result.index is rng

def test_constructor_default_index(self):
s = Series([0, 1, 2])
Expand Down
12 changes: 9 additions & 3 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,26 @@ def test_reindex(self, multiindex_dataframe_random_data):
tm.assert_frame_equal(reindexed, expected)

def test_reindex_preserve_levels(
self, multiindex_year_month_day_dataframe_random_data
self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write
):
ymd = multiindex_year_month_day_dataframe_random_data

new_index = ymd.index[::10]
chunk = ymd.reindex(new_index)
assert chunk.index is new_index
if using_copy_on_write:
assert chunk.index.is_(new_index)
else:
assert chunk.index is new_index

chunk = ymd.loc[new_index]
assert chunk.index.equals(new_index)

ymdT = ymd.T
chunk = ymdT.reindex(columns=new_index)
assert chunk.columns is new_index
if using_copy_on_write:
assert chunk.columns.is_(new_index)
else:
assert chunk.columns is new_index

chunk = ymdT.loc[:, new_index]
assert chunk.columns.equals(new_index)
Expand Down