From 563257ed285af46e156fef8e86c23e936a8918af Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 1 Mar 2023 22:48:55 +0100 Subject: [PATCH 01/21] Implement copy by default for numpy array --- pandas/core/construction.py | 7 ++++++- pandas/core/dtypes/cast.py | 12 ++++++++++-- pandas/core/internals/construction.py | 10 ++++++++++ pandas/tests/frame/methods/test_set_index.py | 2 ++ 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 8c5f291742b9b..b7848b2ba65ce 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -18,6 +18,8 @@ import numpy as np from numpy import ma +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._libs.tslibs.period import Period from pandas._typing import ( @@ -762,6 +764,9 @@ def _try_cast( subarr = maybe_cast_to_integer_array(arr, dtype) else: - subarr = np.array(arr, dtype=dtype, copy=copy) + if using_copy_on_write(): + subarr = np.array(arr, dtype=dtype, copy=copy, order="F") + else: + subarr = np.array(arr, dtype=dtype, copy=copy) return subarr diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5877c60df41fd..aa964eac9061f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -19,6 +19,8 @@ import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._libs.missing import ( NA, @@ -1594,11 +1596,17 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n "NumPy will stop allowing conversion of out-of-bound Python int", DeprecationWarning, ) - casted = np.array(arr, dtype=dtype, copy=False) + if using_copy_on_write(): + casted = np.array(arr, dtype=dtype, copy=False, order="F") + else: + casted = np.array(arr, dtype=dtype, copy=False) else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) - casted = arr.astype(dtype, copy=False) + if using_copy_on_write(): + casted = arr.astype(dtype, copy=False, order="F") + else: + casted = arr.astype(dtype, copy=False) except OverflowError as err: raise OverflowError( "The elements provided in the data cannot all be " diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 4dbdd5e5b77fe..138133211685a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -14,6 +14,8 @@ import numpy as np from numpy import ma +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._typing import ( ArrayLike, @@ -289,6 +291,14 @@ def ndarray_to_mgr( if values.ndim == 1: values = values.reshape(-1, 1) + elif ( + using_copy_on_write() + and isinstance(values, np.ndarray) + and (dtype is None or is_dtype_equal(values.dtype, dtype)) + ): + values = np.array(values, order="F", copy=copy_on_sanitize) + values = _ensure_2d(values) + elif isinstance(values, (np.ndarray, ExtensionArray, ABCSeries, Index)): # drop subclass info values = np.array(values, copy=copy_on_sanitize) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 303eed0b813f4..de596f0411707 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -10,6 +10,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( Categorical, DataFrame, @@ -401,6 +402,7 @@ def test_set_index_preserve_categorical_dtype(self): tm.assert_frame_equal(result, df) def test_set_index_datetime(self): + pd.options.mode.copy_on_write = True # GH#3950 df = DataFrame( { From f3161a33b6624aca6c0f583f2032486a0c2f7df6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 00:46:30 +0100 Subject: [PATCH 02/21] Fix tests --- pandas/conftest.py | 5 ++++- pandas/core/frame.py | 2 ++ pandas/core/internals/construction.py | 5 +++-- pandas/tests/frame/methods/test_fillna.py | 5 ++++- pandas/tests/frame/methods/test_to_numpy.py | 10 +++++++--- pandas/tests/frame/test_constructors.py | 16 ++++++++++++---- 6 files changed, 32 insertions(+), 11 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index aab6de1724677..63028e9ca34b1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -522,7 +522,10 @@ def multiindex_dataframe_random_data( """DataFrame with 2 level MultiIndex with random data""" index = lexsorted_two_level_string_multiindex return DataFrame( - np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + copy=False, ) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7e912b1be5db8..311550f98a948 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -685,6 +685,8 @@ def __init__( # INFO(ArrayManager) by default copy the 2D input array to get # contiguous 1D arrays copy = True + elif using_copy_on_write() and isinstance(data, np.ndarray): + pass else: copy = False diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 138133211685a..ca34a8cece3ae 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -257,7 +257,7 @@ def ndarray_to_mgr( # if the array preparation does a copy -> avoid this for ArrayManager, # since the copy is done on conversion to 1D arrays - copy_on_sanitize = False if typ == "array" else copy + copy_on_sanitize = False if typ == "array" or copy is None else copy vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype): @@ -295,8 +295,9 @@ def ndarray_to_mgr( using_copy_on_write() and isinstance(values, np.ndarray) and (dtype is None or is_dtype_equal(values.dtype, dtype)) + and copy is not False ): - values = np.array(values, order="F", copy=copy_on_sanitize) + values = np.array(values, order="F", copy=False if copy is None else copy) values = _ensure_2d(values) elif isinstance(values, (np.ndarray, ExtensionArray, ABCSeries, Index)): diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index f161cf7b3c525..c8b1b1165a67c 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -57,7 +57,10 @@ def test_fillna_on_column_view(self, using_copy_on_write): # i.e. we didn't create a new 49-column block assert len(df._mgr.arrays) == 1 - assert np.shares_memory(df.values, arr) + if using_copy_on_write: + assert not np.shares_memory(df.values, arr) + else: + assert np.shares_memory(df.values, arr) def test_fillna_datetime(self, datetime_frame): tf = datetime_frame diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index 532f7c87557c8..0989356e9913a 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -23,11 +23,15 @@ def test_to_numpy_dtype(self): tm.assert_numpy_array_equal(result, expected) @td.skip_array_manager_invalid_test - def test_to_numpy_copy(self): + def test_to_numpy_copy(self, using_copy_on_write): arr = np.random.randn(4, 3) df = DataFrame(arr) - assert df.values.base is arr - assert df.to_numpy(copy=False).base is arr + if using_copy_on_write: + assert df.values.base is not arr + assert df.to_numpy(copy=False).base is not arr + else: + assert df.values.base is arr + assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr def test_to_numpy_mixed_dtype_to_str(self): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 569ec613cbcb9..161995f8c3f73 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -313,11 +313,14 @@ def test_1d_object_array_does_not_copy(self): assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test - def test_2d_object_array_does_not_copy(self): + def test_2d_object_array_does_not_copy(self, using_copy_on_write): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") df = DataFrame(arr) - assert np.shares_memory(df.values, arr) + if using_copy_on_write: + assert not np.shares_memory(df.values, arr) + else: + assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): df = DataFrame([[1, "2"], [None, "a"]], dtype=object) @@ -2107,13 +2110,18 @@ def test_constructor_frame_shallow_copy(self, float_frame): cop.index = np.arange(len(cop)) tm.assert_frame_equal(float_frame, orig) - def test_constructor_ndarray_copy(self, float_frame, using_array_manager): + def test_constructor_ndarray_copy( + self, float_frame, using_array_manager, using_copy_on_write + ): if not using_array_manager: arr = float_frame.values.copy() df = DataFrame(arr) arr[5] = 5 - assert (df.values[5] == 5).all() + if using_copy_on_write: + assert not (df.values[5] == 5).all() + else: + assert (df.values[5] == 5).all() df = DataFrame(arr, copy=True) arr[6] = 6 From 3a953113e5256c7942bd0ecf2b637a3b0a05060f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 01:34:02 +0100 Subject: [PATCH 03/21] Simplify --- pandas/core/frame.py | 2 +- pandas/core/internals/construction.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 311550f98a948..fb33f47a04876 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -686,7 +686,7 @@ def __init__( # contiguous 1D arrays copy = True elif using_copy_on_write() and isinstance(data, np.ndarray): - pass + copy = True else: copy = False diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ca34a8cece3ae..33032808008eb 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -257,7 +257,7 @@ def ndarray_to_mgr( # if the array preparation does a copy -> avoid this for ArrayManager, # since the copy is done on conversion to 1D arrays - copy_on_sanitize = False if typ == "array" or copy is None else copy + copy_on_sanitize = False if typ == "array" else copy vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype): @@ -295,9 +295,9 @@ def ndarray_to_mgr( using_copy_on_write() and isinstance(values, np.ndarray) and (dtype is None or is_dtype_equal(values.dtype, dtype)) - and copy is not False + and copy_on_sanitize ): - values = np.array(values, order="F", copy=False if copy is None else copy) + values = np.array(values, order="F", copy=copy_on_sanitize) values = _ensure_2d(values) elif isinstance(values, (np.ndarray, ExtensionArray, ABCSeries, Index)): From 17cf5ae5ebddd1b986e8e8c4ce3796c423ab4e47 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 02:28:22 +0100 Subject: [PATCH 04/21] Fix --- pandas/tests/indexing/multiindex/test_setitem.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index c890754d2d433..1f3e8a599724c 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -482,12 +482,17 @@ def test_setitem_new_column_all_na(self): @td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values # is not a view -def test_frame_setitem_view_direct(multiindex_dataframe_random_data): +def test_frame_setitem_view_direct( + multiindex_dataframe_random_data, using_copy_on_write +): # this works because we are modifying the underlying array # really a no-no df = multiindex_dataframe_random_data.T df["foo"].values[:] = 0 - assert (df["foo"].values == 0).all() + if using_copy_on_write: + assert not (df["foo"].values == 0).all() + else: + assert (df["foo"].values == 0).all() def test_frame_setitem_copy_raises( From 07aa26de999369138b97ec1a82faea26dea21584 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 11:31:24 +0100 Subject: [PATCH 05/21] Remove --- pandas/tests/frame/methods/test_set_index.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index de596f0411707..303eed0b813f4 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -10,7 +10,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( Categorical, DataFrame, @@ -402,7 +401,6 @@ def test_set_index_preserve_categorical_dtype(self): tm.assert_frame_equal(result, df) def test_set_index_datetime(self): - pd.options.mode.copy_on_write = True # GH#3950 df = DataFrame( { From f3ccf0f56918ffc23b620c7f523740b525b3c17b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Mar 2023 10:55:40 +0100 Subject: [PATCH 06/21] Add comment --- pandas/core/construction.py | 1 + pandas/core/dtypes/cast.py | 2 ++ pandas/core/internals/construction.py | 1 + pandas/tests/frame/methods/test_fillna.py | 1 + 4 files changed, 5 insertions(+) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index b7848b2ba65ce..51766839766dc 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -765,6 +765,7 @@ def _try_cast( subarr = maybe_cast_to_integer_array(arr, dtype) else: if using_copy_on_write(): + # switch to same memory layout as with blk.copy() subarr = np.array(arr, dtype=dtype, copy=copy, order="F") else: subarr = np.array(arr, dtype=dtype, copy=copy) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e77ec65d4ba54..225dd9a841034 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1616,6 +1616,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n DeprecationWarning, ) if using_copy_on_write(): + # switch to same memory layout as with blk.copy() casted = np.array(arr, dtype=dtype, copy=False, order="F") else: casted = np.array(arr, dtype=dtype, copy=False) @@ -1623,6 +1624,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) if using_copy_on_write(): + # switch to same memory layout as with blk.copy() casted = arr.astype(dtype, copy=False, order="F") else: casted = arr.astype(dtype, copy=False) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 33032808008eb..b7ce1fdbf9cab 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -297,6 +297,7 @@ def ndarray_to_mgr( and (dtype is None or is_dtype_equal(values.dtype, dtype)) and copy_on_sanitize ): + # switch to same memory layout as with blk.copy() values = np.array(values, order="F", copy=copy_on_sanitize) values = _ensure_2d(values) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index c8b1b1165a67c..bf764e044e5f0 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -49,6 +49,7 @@ def test_fillna_on_column_view(self, using_copy_on_write): arr = np.full((40, 50), np.nan) df = DataFrame(arr) + # TODO(CoW): This should raise a chained assignment error df[0].fillna(-1, inplace=True) if using_copy_on_write: assert np.isnan(arr[:, 0]).all() From 5cdc6ad37d430e85b26015af9a304c1a98c20425 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Mar 2023 11:00:18 +0100 Subject: [PATCH 07/21] Fix --- pandas/tests/frame/methods/test_fillna.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index bf764e044e5f0..d80c3c0da9935 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -47,7 +47,7 @@ def test_fillna_dict_inplace_nonunique_columns(self, using_copy_on_write): def test_fillna_on_column_view(self, using_copy_on_write): # GH#46149 avoid unnecessary copies arr = np.full((40, 50), np.nan) - df = DataFrame(arr) + df = DataFrame(arr, copy=False) # TODO(CoW): This should raise a chained assignment error df[0].fillna(-1, inplace=True) @@ -58,10 +58,7 @@ def test_fillna_on_column_view(self, using_copy_on_write): # i.e. we didn't create a new 49-column block assert len(df._mgr.arrays) == 1 - if using_copy_on_write: - assert not np.shares_memory(df.values, arr) - else: - assert np.shares_memory(df.values, arr) + assert np.shares_memory(df.values, arr) def test_fillna_datetime(self, datetime_frame): tf = datetime_frame From 3e384eafe9ff39f0276dc92e0c28a44de886b9f4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 3 Mar 2023 11:06:53 +0100 Subject: [PATCH 08/21] Change logic --- pandas/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7d5c8fed43e9e..25dcb3c7ad243 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -685,7 +685,9 @@ def __init__( # INFO(ArrayManager) by default copy the 2D input array to get # contiguous 1D arrays copy = True - elif using_copy_on_write() and isinstance(data, np.ndarray): + elif using_copy_on_write() and not isinstance( + data, (Index, DataFrame, Series) + ): copy = True else: copy = False From fcc7be278a6ff190eab705f5751e44bc76258815 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 4 Mar 2023 00:41:13 +0100 Subject: [PATCH 09/21] Fix tests --- pandas/tests/frame/methods/test_transpose.py | 7 +++++-- pandas/tests/frame/methods/test_values.py | 18 ++++++++++++------ pandas/tests/frame/test_constructors.py | 7 +++++-- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 6213a6dbbd0ca..9584ed18183e8 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -117,7 +117,7 @@ def test_transpose_get_view(self, float_frame): assert (float_frame.values[5:10] == 5).all() @td.skip_array_manager_invalid_test - def test_transpose_get_view_dt64tzget_view(self): + def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write): dti = date_range("2016-01-01", periods=6, tz="US/Pacific") arr = dti._data.reshape(3, 2) df = DataFrame(arr) @@ -127,4 +127,7 @@ def test_transpose_get_view_dt64tzget_view(self): assert result._mgr.nblocks == 1 rtrip = result._mgr.blocks[0].values - assert np.shares_memory(arr._ndarray, rtrip._ndarray) + if using_copy_on_write: + assert not np.shares_memory(arr._ndarray, rtrip._ndarray) + else: + assert np.shares_memory(arr._ndarray, rtrip._ndarray) diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index 134534e3c8f8c..2ba4c2fa642f4 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -225,14 +225,17 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame): class TestPrivateValues: @td.skip_array_manager_invalid_test - def test_private_values_dt64tz(self): + def test_private_values_dt64tz(self, using_copy_on_write): dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1) df = DataFrame(dta, columns=["A"]) tm.assert_equal(df._values, dta) - # we have a view - assert np.shares_memory(df._values._ndarray, dta._ndarray) + if using_copy_on_write: + assert not np.shares_memory(df._values._ndarray, dta._ndarray) + else: + # we have a view + assert np.shares_memory(df._values._ndarray, dta._ndarray) # TimedeltaArray tda = dta - dta @@ -240,14 +243,17 @@ def test_private_values_dt64tz(self): tm.assert_equal(df2._values, tda) @td.skip_array_manager_invalid_test - def test_private_values_dt64tz_multicol(self): + def test_private_values_dt64tz_multicol(self, using_copy_on_write): dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2) df = DataFrame(dta, columns=["A", "B"]) tm.assert_equal(df._values, dta) - # we have a view - assert np.shares_memory(df._values._ndarray, dta._ndarray) + if using_copy_on_write: + assert not np.shares_memory(df._values._ndarray, dta._ndarray) + else: + # we have a view + assert np.shares_memory(df._values._ndarray, dta._ndarray) # TimedeltaArray tda = dta - dta diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 161995f8c3f73..1b508d7e3087a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -306,11 +306,14 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test - def test_1d_object_array_does_not_copy(self): + def test_1d_object_array_does_not_copy(self, using_copy_on_write): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr) - assert np.shares_memory(df.values, arr) + if using_copy_on_write: + assert not np.shares_memory(df.values, arr) + else: + assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test def test_2d_object_array_does_not_copy(self, using_copy_on_write): From 92238364137b66c8429a4e707cce07f1461e967c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 4 Mar 2023 00:44:44 +0100 Subject: [PATCH 10/21] Fix test --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 63028e9ca34b1..297d83574eacd 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -525,7 +525,6 @@ def multiindex_dataframe_random_data( np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp"), - copy=False, ) From a474bf5efaae8dea19c8ddf3586d0f8c022bcafb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 4 Mar 2023 00:45:13 +0100 Subject: [PATCH 11/21] Fix test --- pandas/conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 297d83574eacd..aab6de1724677 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -522,9 +522,7 @@ def multiindex_dataframe_random_data( """DataFrame with 2 level MultiIndex with random data""" index = lexsorted_two_level_string_multiindex return DataFrame( - np.random.randn(10, 3), - index=index, - columns=Index(["A", "B", "C"], name="exp"), + np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") ) From 293f8a5d030a3208c61fac0d24c401c72651e09f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 21:48:18 +0100 Subject: [PATCH 12/21] Remove order changes --- pandas/conftest.py | 1 + pandas/core/construction.py | 8 +------- pandas/core/dtypes/cast.py | 14 ++------------ pandas/core/internals/construction.py | 3 +-- 4 files changed, 5 insertions(+), 21 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index aab6de1724677..a1ad1abc65dff 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1919,6 +1919,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 51766839766dc..8c5f291742b9b 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -18,8 +18,6 @@ import numpy as np from numpy import ma -from pandas._config import using_copy_on_write - from pandas._libs import lib from pandas._libs.tslibs.period import Period from pandas._typing import ( @@ -764,10 +762,6 @@ def _try_cast( subarr = maybe_cast_to_integer_array(arr, dtype) else: - if using_copy_on_write(): - # switch to same memory layout as with blk.copy() - subarr = np.array(arr, dtype=dtype, copy=copy, order="F") - else: - subarr = np.array(arr, dtype=dtype, copy=copy) + subarr = np.array(arr, dtype=dtype, copy=copy) return subarr diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index cfbae96f75e26..cfcae7f40919a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -19,8 +19,6 @@ import numpy as np -from pandas._config import using_copy_on_write - from pandas._libs import lib from pandas._libs.missing import ( NA, @@ -1616,19 +1614,11 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n "NumPy will stop allowing conversion of out-of-bound Python int", DeprecationWarning, ) - if using_copy_on_write(): - # switch to same memory layout as with blk.copy() - casted = np.array(arr, dtype=dtype, copy=False, order="F") - else: - casted = np.array(arr, dtype=dtype, copy=False) + casted = np.array(arr, dtype=dtype, copy=False) else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) - if using_copy_on_write(): - # switch to same memory layout as with blk.copy() - casted = arr.astype(dtype, copy=False, order="F") - else: - casted = arr.astype(dtype, copy=False) + casted = arr.astype(dtype, copy=False) except OverflowError as err: raise OverflowError( "The elements provided in the data cannot all be " diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index bbf3d6a5e4d3f..5b607d6eceefa 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -299,8 +299,7 @@ def ndarray_to_mgr( and (dtype is None or is_dtype_equal(values.dtype, dtype)) and copy_on_sanitize ): - # switch to same memory layout as with blk.copy() - values = np.array(values, order="F", copy=copy_on_sanitize) + values = np.array(values, copy=copy_on_sanitize) values = _ensure_2d(values) elif isinstance(values, (np.ndarray, ExtensionArray, ABCSeries, Index)): From 265d9e3a128d76d464be25316806541ccf421e9f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 21:52:44 +0100 Subject: [PATCH 13/21] Update --- pandas/conftest.py | 1 - pandas/core/frame.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index fcf74bbf796dd..95bb2078d151c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1921,7 +1921,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 64e6bcc949683..3814323868bf5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3842,7 +3842,7 @@ def _getitem_multilevel(self, key): else: new_values = self._values[:, loc] result = self._constructor( - new_values, index=self.index, columns=result_columns + new_values, index=self.index, columns=result_columns, copy=False ) if using_copy_on_write() and isinstance(loc, slice): result._mgr.add_references(self._mgr) # type: ignore[arg-type] From 9ac1bae90ab7cfa9bf709e20449be6a48655446d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 21:59:18 +0100 Subject: [PATCH 14/21] Address review --- pandas/tests/frame/methods/test_to_numpy.py | 7 +++++-- pandas/tests/frame/methods/test_transpose.py | 2 +- pandas/tests/frame/test_constructors.py | 18 ++++++------------ 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index f127670574cc4..ae0eafb0bf348 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -28,7 +28,7 @@ def test_to_numpy_copy(self, using_copy_on_write): df = DataFrame(arr) if using_copy_on_write: assert df.values.base is not arr - assert df.to_numpy(copy=False).base is not arr + assert df.to_numpy(copy=False).base is df.values.base else: assert df.values.base is arr assert df.to_numpy(copy=False).base is arr @@ -36,7 +36,10 @@ def test_to_numpy_copy(self, using_copy_on_write): # we still don't want a copy when na_value=np.nan is passed, # and that can be respected because we are already numpy-float - assert df.to_numpy(copy=False, na_value=np.nan).base is arr + if using_copy_on_write: + assert df.to_numpy(copy=False).base is df.values.base + else: + assert df.to_numpy(copy=False, na_value=np.nan).base is arr def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index ccb1914afc75e..8ff6ea37eae18 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -133,7 +133,7 @@ def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write): rtrip = result._mgr.blocks[0].values if using_copy_on_write: - assert not np.shares_memory(arr._ndarray, rtrip._ndarray) + assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray) else: assert np.shares_memory(arr._ndarray, rtrip._ndarray) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0f5ecf2080bb0..db7e97d672b4d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -306,24 +306,18 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test - def test_1d_object_array_does_not_copy(self, using_copy_on_write): + def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") - df = DataFrame(arr) - if using_copy_on_write: - assert not np.shares_memory(df.values, arr) - else: - assert np.shares_memory(df.values, arr) + df = DataFrame(arr, copy=False) + assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test - def test_2d_object_array_does_not_copy(self, using_copy_on_write): + def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") - df = DataFrame(arr) - if using_copy_on_write: - assert not np.shares_memory(df.values, arr) - else: - assert np.shares_memory(df.values, arr) + df = DataFrame(arr, copy=False) + assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): df = DataFrame([[1, "2"], [None, "a"]], dtype=object) From db92ce4ebe0e0d470d0b504bbf03485887141cb9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 22:02:00 +0100 Subject: [PATCH 15/21] Add test --- pandas/tests/copy_view/test_constructors.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 6cf45c194707e..6abaec0b89ef4 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -179,3 +179,14 @@ def test_dataframe_from_dict_of_series_with_dtype(index): df.iloc[0, 0] = 100 arr_after = get_array(df, "a") assert np.shares_memory(arr_before, arr_after) + + +@pytest.mark.parametrize("copy", [False, None, True]) +def test_frame_from_numpy_array(using_copy_on_write, copy): + arr = np.array([[1, 2], [3, 4]]) + df = DataFrame(arr, copy=copy) + + if using_copy_on_write and copy is not False or copy is True: + assert not np.shares_memory(get_array(df, 0), arr) + else: + assert np.shares_memory(get_array(df, 0), arr) From be9cb0452e7421bca5a0b21bc816f2a9dff56308 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 14:34:11 +0100 Subject: [PATCH 16/21] Fix test --- pandas/tests/indexing/multiindex/test_setitem.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index e6c298f87c072..27b4a4be73032 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -493,6 +493,7 @@ def test_frame_setitem_view_direct( df["foo"].values[:] = 0 assert (df["foo"].values != 0).all() else: + df["foo"].values[:] = 0 assert (df["foo"].values == 0).all() From e2eceec7f02e3542324fc1c6e8b3d94eb061162e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 17:09:46 +0100 Subject: [PATCH 17/21] Fix array manager --- pandas/tests/copy_view/test_constructors.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 6abaec0b89ef4..706600f5aef38 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -182,11 +182,17 @@ def test_dataframe_from_dict_of_series_with_dtype(index): @pytest.mark.parametrize("copy", [False, None, True]) -def test_frame_from_numpy_array(using_copy_on_write, copy): +def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager): arr = np.array([[1, 2], [3, 4]]) df = DataFrame(arr, copy=copy) - if using_copy_on_write and copy is not False or copy is True: + if ( + using_copy_on_write + and copy is not False + or copy is True + or using_array_manager + and copy is None + ): assert not np.shares_memory(get_array(df, 0), arr) else: assert np.shares_memory(get_array(df, 0), arr) From 65965c684eb981c3015122a13488546a9de13407 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 21:09:55 +0100 Subject: [PATCH 18/21] Remove elif --- pandas/core/internals/construction.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 8b009bdfe3a32..b114b8a1aa7aa 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -15,8 +15,6 @@ import numpy as np from numpy import ma -from pandas._config import using_copy_on_write - from pandas._libs import lib from pandas.core.dtypes.cast import ( @@ -293,15 +291,6 @@ def ndarray_to_mgr( if values.ndim == 1: values = values.reshape(-1, 1) - elif ( - using_copy_on_write() - and isinstance(values, np.ndarray) - and (dtype is None or is_dtype_equal(values.dtype, dtype)) - and copy_on_sanitize - ): - values = np.array(values, copy=copy_on_sanitize) - values = _ensure_2d(values) - elif isinstance(values, (np.ndarray, ExtensionArray, ABCSeries, Index)): # drop subclass info values = np.array(values, copy=copy_on_sanitize) From ecb756c4b9675a0077864225ebdec4a8f4e261d2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 15 Mar 2023 21:14:38 +0100 Subject: [PATCH 19/21] Update pandas/tests/copy_view/test_constructors.py Co-authored-by: Joris Van den Bossche --- pandas/tests/copy_view/test_constructors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 706600f5aef38..998849ba4794b 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -190,8 +190,7 @@ def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager): using_copy_on_write and copy is not False or copy is True - or using_array_manager - and copy is None + or (using_array_manager and copy is None) ): assert not np.shares_memory(get_array(df, 0), arr) else: From 177fbbcfd8662cd8c8546850a37ddc5eeb1109ff Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Mar 2023 21:16:21 +0100 Subject: [PATCH 20/21] Add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c927dc2ac4a96..6fb1ff51216cb 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -190,6 +190,10 @@ Copy-on-Write improvements of Series objects and specifying ``copy=False``, will now use a lazy copy of those Series objects for the columns of the DataFrame (:issue:`50777`) +- The :class:`DataFrame` constructor, when constructing from a NumPy array, + will now copy the array by default to avoid mutating the :class:`DataFrame` + when mutating the array. Specify ``copy=False`` to get the old behavior. + - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``) will now always raise an warning when Copy-on-Write is enabled. In this mode, chained assignment can never work because we are always setting into a temporary From 5bef4ba3893b6d5a6d4ab18555517f4e93f48ed2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Mar 2023 16:35:25 +0100 Subject: [PATCH 21/21] Add note --- doc/source/whatsnew/v2.0.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d3ac2800d659b..25ff4d6e7d7e1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -193,6 +193,9 @@ Copy-on-Write improvements - The :class:`DataFrame` constructor, when constructing from a NumPy array, will now copy the array by default to avoid mutating the :class:`DataFrame` when mutating the array. Specify ``copy=False`` to get the old behavior. + When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write + behavior when the NumPy array is modified after creation of the + :class:`DataFrame`. - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``) will now always raise an warning when Copy-on-Write is enabled. In this mode,