diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4d171a6490ccc..36fd0dda5d2bc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11028,7 +11028,7 @@ def to_timestamp( >>> df2.index DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None) """ - new_obj = self.copy(deep=copy) + new_obj = self.copy(deep=copy and not using_copy_on_write()) axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) @@ -11085,7 +11085,7 @@ def to_period( >>> idx.to_period("Y") PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]') """ - new_obj = self.copy(deep=copy) + new_obj = self.copy(deep=copy and not using_copy_on_write()) axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 65f76f7e295a6..8a34df3385036 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -442,7 +442,7 @@ def set_flags( >>> df2.flags.allows_duplicate_labels False """ - df = self.copy(deep=copy) + df = self.copy(deep=copy and not using_copy_on_write()) if allows_duplicate_labels is not None: df.flags["allows_duplicate_labels"] = allows_duplicate_labels return df @@ -713,7 +713,7 @@ def _set_axis_nocheck( else: # With copy=False, we create a new object but don't copy the # underlying data. - obj = self.copy(deep=copy) + obj = self.copy(deep=copy and not using_copy_on_write()) setattr(obj, obj._get_axis_name(axis), labels) return obj @@ -742,7 +742,7 @@ def swapaxes( j = self._get_axis_number(axis2) if i == j: - return self.copy(deep=copy) + return self.copy(deep=copy and not using_copy_on_write()) mapping = {i: j, j: i} @@ -999,7 +999,7 @@ def _rename( index = mapper self._check_inplace_and_allows_duplicate_labels(inplace) - result = self if inplace else self.copy(deep=copy) + result = self if inplace else self.copy(deep=copy and not using_copy_on_write()) for axis_no, replacements in enumerate((index, columns)): if replacements is None: @@ -1215,6 +1215,9 @@ class name inplace = validate_bool_kwarg(inplace, "inplace") + if copy and using_copy_on_write(): + copy = False + if mapper is not lib.no_default: # Use v0.23 behavior if a scalar or list non_mapper = is_scalar(mapper) or ( @@ -5322,6 +5325,8 @@ def reindex( # if all axes that are requested to reindex are equal, then only copy # if indicated must have index names equal here as well as values + if copy and using_copy_on_write(): + copy = False if all( self._get_axis(axis_name).identical(ax) for axis_name, ax in axes.items() @@ -5416,10 +5421,14 @@ def _reindex_with_indexers( # If we've made a copy once, no need to make another one copy = False - if (copy or copy is None) and new_data is self._mgr: + if ( + (copy or copy is None) + and new_data is self._mgr + and not using_copy_on_write() + ): new_data = new_data.copy(deep=copy) elif using_copy_on_write() and new_data is self._mgr: - new_data = new_data.copy(deep=copy) + new_data = new_data.copy(deep=False) return self._constructor(new_data).__finalize__(self) @@ -6239,6 +6248,9 @@ def astype( 2 2020-01-03 dtype: datetime64[ns] """ + if copy and using_copy_on_write(): + copy = False + if is_dict_like(dtype): if self.ndim == 1: # i.e. Series if len(dtype) > 1 or self.name not in dtype: @@ -9499,6 +9511,8 @@ def _align_series( fill_axis: Axis = 0, ): is_series = isinstance(self, ABCSeries) + if copy and using_copy_on_write(): + copy = False if (not is_series and axis is None) or axis not in [None, 0, 1]: raise ValueError("Must specify axis=0 or 1") @@ -10261,8 +10275,7 @@ def truncate( if isinstance(ax, MultiIndex): setattr(result, self._get_axis_name(axis), ax.truncate(before, after)) - if copy or (copy is None and not using_copy_on_write()): - result = result.copy(deep=copy) + result = result.copy(deep=copy and not using_copy_on_write()) return result @@ -10343,7 +10356,7 @@ def _tz_convert(ax, tz): raise ValueError(f"The level {level} is not valid") ax = _tz_convert(ax, tz) - result = self.copy(deep=copy) + result = self.copy(deep=copy and not using_copy_on_write()) result = result.set_axis(ax, axis=axis, copy=False) return result.__finalize__(self, method="tz_convert") @@ -10525,7 +10538,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): raise ValueError(f"The level {level} is not valid") ax = _tz_localize(ax, tz, ambiguous, nonexistent) - result = self.copy(deep=copy) + result = self.copy(deep=copy and not using_copy_on_write()) result = result.set_axis(ax, axis=axis, copy=False) return result.__finalize__(self, method="tz_localize") diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index dfe6a5a94ad58..05ba4170cb329 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -442,6 +442,8 @@ def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> copy = False else: copy = True + elif using_copy_on_write(): + copy = False return self.apply( "astype", @@ -457,6 +459,8 @@ def convert(self: T, copy: bool | None) -> T: copy = False else: copy = True + elif using_copy_on_write(): + copy = False return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 6758ab9cb6814..bc8f4b97d539a 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -366,6 +366,8 @@ def concat( copy = False else: copy = True + elif copy and using_copy_on_write(): + copy = False op = _Concatenator( objs, diff --git a/pandas/core/series.py b/pandas/core/series.py index c7a86e2b0bf09..02ec7208be0cd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4129,7 +4129,7 @@ def swaplevel( {examples} """ assert isinstance(self.index, MultiIndex) - result = self.copy(deep=copy) + result = self.copy(deep=copy and not using_copy_on_write()) result.index = self.index.swaplevel(i, j) return result @@ -5743,7 +5743,7 @@ def to_timestamp( if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - new_obj = self.copy(deep=copy) + new_obj = self.copy(deep=copy and not using_copy_on_write()) new_index = self.index.to_timestamp(freq=freq, how=how) setattr(new_obj, "index", new_index) return new_obj @@ -5783,7 +5783,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series if not isinstance(self.index, DatetimeIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - new_obj = self.copy(deep=copy) + new_obj = self.copy(deep=copy and not using_copy_on_write()) new_index = self.index.to_period(freq=freq) setattr(new_obj, "index", new_index) return new_obj diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index b6f2f0543cb2b..53d72baf7da4e 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -181,6 +181,21 @@ def test_concat_mixed_series_frame(using_copy_on_write): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("copy", [True, None, False]) +def test_concat_copy_keyword(using_copy_on_write, copy): + df = DataFrame({"a": [1, 2]}) + df2 = DataFrame({"b": [1.5, 2.5]}) + + result = concat([df, df2], axis=1, copy=copy) + + if using_copy_on_write or copy is False: + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) + else: + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) + + @pytest.mark.parametrize( "func", [ @@ -280,3 +295,18 @@ def test_merge_on_key_enlarging_one(using_copy_on_write, func, how): assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) tm.assert_frame_equal(df1, df1_orig) tm.assert_frame_equal(df2, df2_orig) + + +@pytest.mark.parametrize("copy", [True, None, False]) +def test_merge_copy_keyword(using_copy_on_write, copy): + df = DataFrame({"a": [1, 2]}) + df2 = DataFrame({"b": [3, 4.5]}) + + result = df.merge(df2, copy=copy, left_index=True, right_index=True) + + if using_copy_on_write or copy is False: + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) + else: + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index b30f8ab4c7b9c..7429a73717470 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -66,6 +66,7 @@ def test_copy_shallow(using_copy_on_write): lambda df, copy: df.rename(columns=str.lower, copy=copy), lambda df, copy: df.reindex(columns=["a", "c"], copy=copy), lambda df, copy: df.reindex_like(df, copy=copy), + lambda df, copy: df.align(df, copy=copy)[0], lambda df, copy: df.set_axis(["a", "b", "c"], axis="index", copy=copy), lambda df, copy: df.rename_axis(index="test", copy=copy), lambda df, copy: df.rename_axis(columns="test", copy=copy), @@ -84,6 +85,7 @@ def test_copy_shallow(using_copy_on_write): "rename", "reindex", "reindex_like", + "align", "set_axis", "rename_axis0", "rename_axis1", @@ -115,15 +117,12 @@ def test_methods_copy_keyword( df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=index) df2 = method(df, copy=copy) - share_memory = (using_copy_on_write and copy is not True) or copy is False + share_memory = using_copy_on_write or copy is False if request.node.callspec.id.startswith("reindex-"): # TODO copy=False without CoW still returns a copy in this case if not using_copy_on_write and not using_array_manager and copy is False: share_memory = False - # TODO copy=True with CoW still returns a view - if using_copy_on_write: - share_memory = True if share_memory: assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) @@ -131,6 +130,83 @@ def test_methods_copy_keyword( assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) +@pytest.mark.parametrize("copy", [True, None, False]) +@pytest.mark.parametrize( + "method", + [ + lambda ser, copy: ser.rename(index={0: 100}, copy=copy), + lambda ser, copy: ser.reindex(index=ser.index, copy=copy), + lambda ser, copy: ser.reindex_like(ser, copy=copy), + lambda ser, copy: ser.align(ser, copy=copy)[0], + lambda ser, copy: ser.set_axis(["a", "b", "c"], axis="index", copy=copy), + lambda ser, copy: ser.rename_axis(index="test", copy=copy), + lambda ser, copy: ser.astype("int64", copy=copy), + lambda ser, copy: ser.swaplevel(0, 1, copy=copy), + lambda ser, copy: ser.swapaxes(0, 0, copy=copy), + lambda ser, copy: ser.truncate(0, 5, copy=copy), + lambda ser, copy: ser.infer_objects(copy=copy), + lambda ser, copy: ser.to_timestamp(copy=copy), + lambda ser, copy: ser.to_period(freq="D", copy=copy), + lambda ser, copy: ser.tz_localize("US/Central", copy=copy), + lambda ser, copy: ser.tz_convert("US/Central", copy=copy), + lambda ser, copy: ser.set_flags(allows_duplicate_labels=False, copy=copy), + ], + ids=[ + "rename", + "reindex", + "reindex_like", + "align", + "set_axis", + "rename_axis0", + "astype", + "swaplevel", + "swapaxes", + "truncate", + "infer_objects", + "to_timestamp", + "to_period", + "tz_localize", + "tz_convert", + "set_flags", + ], +) +def test_methods_series_copy_keyword(request, method, copy, using_copy_on_write): + index = None + if "to_timestamp" in request.node.callspec.id: + index = period_range("2012-01-01", freq="D", periods=3) + elif "to_period" in request.node.callspec.id: + index = date_range("2012-01-01", freq="D", periods=3) + elif "tz_localize" in request.node.callspec.id: + index = date_range("2012-01-01", freq="D", periods=3) + elif "tz_convert" in request.node.callspec.id: + index = date_range("2012-01-01", freq="D", periods=3, tz="Europe/Brussels") + elif "swaplevel" in request.node.callspec.id: + index = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]]) + + ser = Series([1, 2, 3], index=index) + ser2 = method(ser, copy=copy) + + share_memory = using_copy_on_write or copy is False + + if share_memory: + assert np.shares_memory(get_array(ser2), get_array(ser)) + else: + assert not np.shares_memory(get_array(ser2), get_array(ser)) + + +@pytest.mark.parametrize("copy", [True, None, False]) +def test_transpose_copy_keyword(using_copy_on_write, copy, using_array_manager): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df.transpose(copy=copy) + share_memory = using_copy_on_write or copy is False or copy is None + share_memory = share_memory and not using_array_manager + + if share_memory: + assert np.shares_memory(get_array(df, "a"), get_array(result, 0)) + else: + assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) + + # ----------------------------------------------------------------------------- # DataFrame methods returning new DataFrame using shallow copy @@ -1119,14 +1195,13 @@ def test_set_flags(using_copy_on_write): tm.assert_series_equal(ser, expected) -@pytest.mark.parametrize("copy_kwargs", [{"copy": True}, {}]) @pytest.mark.parametrize("kwargs", [{"mapper": "test"}, {"index": "test"}]) -def test_rename_axis(using_copy_on_write, kwargs, copy_kwargs): +def test_rename_axis(using_copy_on_write, kwargs): df = DataFrame({"a": [1, 2, 3, 4]}, index=Index([1, 2, 3, 4], name="a")) df_orig = df.copy() - df2 = df.rename_axis(**kwargs, **copy_kwargs) + df2 = df.rename_axis(**kwargs) - if using_copy_on_write and not copy_kwargs: + if using_copy_on_write: assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index ceea53e3dd8bf..52e841a8c569a 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -149,7 +149,10 @@ def test_reindex_copies_ea(self, using_copy_on_write): # pass both columns and index result2 = df.reindex(columns=cols, index=df.index, copy=True) - assert not np.shares_memory(result2[0].array._data, df[0].array._data) + if using_copy_on_write: + assert np.shares_memory(result2[0].array._data, df[0].array._data) + else: + assert not np.shares_memory(result2[0].array._data, df[0].array._data) @td.skip_array_manager_not_yet_implemented def test_reindex_date_fill_value(self): diff --git a/pandas/tests/frame/methods/test_set_axis.py b/pandas/tests/frame/methods/test_set_axis.py index fd140e0098f2a..2fc629b14a50e 100644 --- a/pandas/tests/frame/methods/test_set_axis.py +++ b/pandas/tests/frame/methods/test_set_axis.py @@ -33,13 +33,14 @@ def test_set_axis_copy(self, obj, using_copy_on_write): tm.assert_equal(expected, result) assert result is not obj # check we DID make a copy - if obj.ndim == 1: - assert not tm.shares_memory(result, obj) - else: - assert not any( - tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) - for i in range(obj.shape[1]) - ) + if not using_copy_on_write: + if obj.ndim == 1: + assert not tm.shares_memory(result, obj) + else: + assert not any( + tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) + for i in range(obj.shape[1]) + ) result = obj.set_axis(new_index, axis=0, copy=False) tm.assert_equal(expected, result) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index b08d0a33d08c6..44b02310eb8a7 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -59,8 +59,12 @@ def test_concat_copy(self, using_array_manager, using_copy_on_write): # These are actual copies. result = concat([df, df2, df3], axis=1, copy=True) - for arr in result._mgr.arrays: - assert arr.base is None + if not using_copy_on_write: + for arr in result._mgr.arrays: + assert arr.base is None + else: + for arr in result._mgr.arrays: + assert arr.base is not None # These are the same. result = concat([df, df2, df3], axis=1, copy=False) diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index 23a49c33099cb..105ffe84a0703 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -195,15 +195,16 @@ def test_concat_duplicates_in_index_with_keys(self): @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("order", ["C", "F"]) @pytest.mark.parametrize("axis", [0, 1]) - def test_concat_copies(self, axis, order, ignore_index): + def test_concat_copies(self, axis, order, ignore_index, using_copy_on_write): # based on asv ConcatDataFrames df = DataFrame(np.zeros((10000, 200), dtype=np.float32, order=order)) res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True) - for arr in res._iter_column_arrays(): - for arr2 in df._iter_column_arrays(): - assert not np.shares_memory(arr, arr2) + if not using_copy_on_write: + for arr in res._iter_column_arrays(): + for arr2 in df._iter_column_arrays(): + assert not np.shares_memory(arr, arr2) def test_outer_sort_columns(self): # GH#47127 diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index e0ea09138ef3c..ce06e74de91b9 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -100,18 +100,28 @@ def test_concat_rename_index(self): tm.assert_frame_equal(result, exp) assert result.index.names == exp.index.names - def test_concat_copy_index_series(self, axis): + def test_concat_copy_index_series(self, axis, using_copy_on_write): # GH 29879 ser = Series([1, 2]) comb = concat([ser, ser], axis=axis, copy=True) - assert comb.index is not ser.index + if not using_copy_on_write or axis in [0, "index"]: + assert comb.index is not ser.index + else: + assert comb.index is ser.index - def test_concat_copy_index_frame(self, axis): + def test_concat_copy_index_frame(self, axis, using_copy_on_write): # GH 29879 df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) comb = concat([df, df], axis=axis, copy=True) - assert comb.index is not df.index - assert comb.columns is not df.columns + if not using_copy_on_write: + assert comb.index is not df.index + assert comb.columns is not df.columns + elif axis in [0, "index"]: + assert comb.index is not df.index + assert comb.columns is df.columns + elif axis in [1, "columns"]: + assert comb.index is df.index + assert comb.columns is not df.columns def test_default_index(self): # is_series and ignore_index diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index b2e03684bc902..7f34f4046d33c 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -118,14 +118,18 @@ def test_align_nocopy(datetime_series, using_copy_on_write): assert (b[:2] == 5).all() -def test_align_same_index(datetime_series): +def test_align_same_index(datetime_series, using_copy_on_write): a, b = datetime_series.align(datetime_series, copy=False) assert a.index is datetime_series.index assert b.index is datetime_series.index a, b = datetime_series.align(datetime_series, copy=True) - assert a.index is not datetime_series.index - assert b.index is not datetime_series.index + if not using_copy_on_write: + assert a.index is not datetime_series.index + assert b.index is not datetime_series.index + else: + assert a.index is datetime_series.index + assert b.index is datetime_series.index def test_align_multiindex():