From 2ff866728447d03bfa34a67152b0ce06927b54fb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Nov 2021 16:51:08 +0100 Subject: [PATCH 01/10] Revert "BUG: ArrayManager construction unwanted copy (#42689)" This reverts commit cb3b4e4a0e9ae350b6e1dc2e42f27ec9fd23cdb7. --- pandas/core/internals/construction.py | 4 ++-- pandas/tests/frame/methods/test_values.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index a766f8321a641..a25b6959605d4 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -349,14 +349,14 @@ def ndarray_to_mgr( if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( - maybe_infer_to_datetimelike(values[:, i]) + maybe_infer_to_datetimelike(values[:, i].copy()) ) for i in range(values.shape[1]) ] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) - arrays = [values[:, i] for i in range(values.shape[1])] + arrays = [values[:, i].copy() for i in range(values.shape[1])] return ArrayManager(arrays, [index, columns], verify_integrity=False) diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index 477099fba75e1..2ff991b62b67e 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -226,7 +226,10 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame): class TestPrivateValues: - def test_private_values_dt64tz(self, request): + def test_private_values_dt64tz(self, using_array_manager, request): + if using_array_manager: + mark = pytest.mark.xfail(reason="doesn't share memory") + request.node.add_marker(mark) dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1) From 03e568efd1defd990fc488a88577831650ff9999 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Nov 2021 17:07:46 +0100 Subject: [PATCH 02/10] add comment and test --- pandas/core/internals/construction.py | 1 + pandas/tests/frame/test_constructors.py | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index a25b6959605d4..9a66f31a252d3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -356,6 +356,7 @@ def ndarray_to_mgr( else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) + # copy the array to ensure contiguous memory arrays = [values[:, i].copy() for i in range(values.shape[1])] return ArrayManager(arrays, [index, columns], verify_integrity=False) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6ec2b243d540a..3f0114eefee32 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -256,12 +256,17 @@ def test_constructor_dtype_nocast_view_dataframe(self): should_be_view[0][0] = 99 assert df.values[0, 0] == 99 - @td.skip_array_manager_invalid_test # TODO(ArrayManager) keep view on 2D array? - def test_constructor_dtype_nocast_view_2d_array(self): - df = DataFrame([[1, 2]]) - should_be_view = DataFrame(df.values, dtype=df[0].dtype) - should_be_view[0][0] = 97 - assert df.values[0, 0] == 97 + def test_constructor_dtype_nocast_view_2d_array(self, using_array_manager): + df = DataFrame([[1, 2], [3, 4]], dtype="int64") + if not using_array_manager: + should_be_view = DataFrame(df.values, dtype=df[0].dtype) + should_be_view[0][0] = 97 + assert df.values[0, 0] == 97 + else: + # INFO(ArrayManager) DataFrame(ndarray) doesn't necessarily preserve + # a view on the array to ensure contiguous 1D arrays + df2 = DataFrame(df.values, dtype=df[0].dtype) + assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test def test_1d_object_array_does_not_copy(self): From a515fd134f8f2b807aa0e8203c4e1f8f5057b7bb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Nov 2021 08:41:06 +0100 Subject: [PATCH 03/10] control through copy=None keyword --- pandas/core/frame.py | 18 +++++++++++------- pandas/core/internals/construction.py | 13 +++++++++---- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0960ab4a81149..79f1733cea22c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -593,13 +593,6 @@ def __init__( copy: bool | None = None, ): - if copy is None: - if isinstance(data, dict) or data is None: - # retain pre-GH#38939 default behavior - copy = True - else: - copy = False - if data is None: data = {} if dtype is not None: @@ -618,6 +611,17 @@ def __init__( manager = get_option("mode.data_manager") + if copy is None: + if isinstance(data, dict) or data is None: + # retain pre-GH#38939 default behavior + copy = True + elif manager == "array" and isinstance(data, np.ndarray): + # INFO(ArrayManager) by default copy the 2D input array to get + # contiguous 1D arrays + copy = True + else: + copy = False + if isinstance(data, (BlockManager, ArrayManager)): mgr = self._init_mgr( data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 9a66f31a252d3..bda0e0f72c900 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -319,7 +319,7 @@ def ndarray_to_mgr( else: # by definition an array here # the dtypes will be coerced to a single dtype - values = _prep_ndarray(values, copy=copy) + values = _prep_ndarray(values, copy=False if typ == "array" else copy) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape @@ -328,8 +328,9 @@ def ndarray_to_mgr( # GH#40110 see similar check inside sanitize_array rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") + copy_on_sanitize = False if typ == "array" else copy values = sanitize_array( - flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf + flat, None, dtype=dtype, copy=copy_on_sanitize, raise_cast_failure=rcf ) values = values.reshape(shape) @@ -353,11 +354,15 @@ def ndarray_to_mgr( ) for i in range(values.shape[1]) ] + if copy: + arrays = [arr.copy() for arr in arrays] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) - # copy the array to ensure contiguous memory - arrays = [values[:, i].copy() for i in range(values.shape[1])] + if copy: + arrays = [values[:, i].copy() for i in range(values.shape[1])] + else: + arrays = [values[:, i] for i in range(values.shape[1])] return ArrayManager(arrays, [index, columns], verify_integrity=False) From 2ac744e90477ea2a7731b0143bd5c7244f15b5e0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Nov 2021 09:32:14 +0100 Subject: [PATCH 04/10] add explicit test --- pandas/tests/frame/test_constructors.py | 30 +++++++++++++++++-------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3f0114eefee32..f55a7659f2b9a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2108,17 +2108,29 @@ def test_constructor_frame_copy(self, float_frame): assert (cop["A"] == 5).all() assert not (float_frame["A"] == 5).all() - # TODO(ArrayManager) keep view on 2D array? - @td.skip_array_manager_not_yet_implemented - def test_constructor_ndarray_copy(self, float_frame): - df = DataFrame(float_frame.values) + def test_constructor_ndarray_copy(self, float_frame, using_array_manager): + if not using_array_manager: + df = DataFrame(float_frame.values) - float_frame.values[5] = 5 - assert (df.values[5] == 5).all() + float_frame.values[5] = 5 + assert (df.values[5] == 5).all() - df = DataFrame(float_frame.values, copy=True) - float_frame.values[6] = 6 - assert not (df.values[6] == 6).all() + df = DataFrame(float_frame.values, copy=True) + float_frame.values[6] = 6 + assert not (df.values[6] == 6).all() + else: + arr = float_frame.values.copy() + # default: copy to ensure contiguous arrays + df = DataFrame(arr) + assert df._mgr.arrays[0].flags.c_contiguous + arr[0, 0] = 100 + assert df.iloc[0, 0] != 100 + + # manually specify copy=False + df = DataFrame(arr, copy=False) + assert not df._mgr.arrays[0].flags.c_contiguous + arr[0, 0] = 1000 + assert df.iloc[0, 0] == 1000 # TODO(ArrayManager) keep view on Series? @td.skip_array_manager_not_yet_implemented From 53a744ad10be6559c2c10574465b55114a6ce80f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Nov 2021 17:58:07 +0100 Subject: [PATCH 05/10] limit to 2D array input --- pandas/core/frame.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79f1733cea22c..d93cc3e7794a3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -615,7 +615,11 @@ def __init__( if isinstance(data, dict) or data is None: # retain pre-GH#38939 default behavior copy = True - elif manager == "array" and isinstance(data, np.ndarray): + elif ( + manager == "array" + and isinstance(data, (np.ndarray, ExtensionArray)) + and data.ndim == 2 + ): # INFO(ArrayManager) by default copy the 2D input array to get # contiguous 1D arrays copy = True From 1bb6af53a51d9067cf1c00f55b69fab90dad8b44 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Nov 2021 15:53:59 +0100 Subject: [PATCH 06/10] update test_private_values_dt64tz --- pandas/tests/frame/methods/test_values.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index 2ff991b62b67e..25f0382696e18 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -226,11 +226,8 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame): class TestPrivateValues: + @td.skip_array_manager_invalid_test def test_private_values_dt64tz(self, using_array_manager, request): - if using_array_manager: - mark = pytest.mark.xfail(reason="doesn't share memory") - request.node.add_marker(mark) - dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1) df = DataFrame(dta, columns=["A"]) From 7e96eae378f0197cd5758be263b48e8a322f0814 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Nov 2021 17:01:50 +0100 Subject: [PATCH 07/10] copy after if/else --- pandas/core/internals/construction.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index bda0e0f72c900..daefdc57117e9 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -354,15 +354,13 @@ def ndarray_to_mgr( ) for i in range(values.shape[1]) ] - if copy: - arrays = [arr.copy() for arr in arrays] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) - if copy: - arrays = [values[:, i].copy() for i in range(values.shape[1])] - else: - arrays = [values[:, i] for i in range(values.shape[1])] + arrays = [values[:, i] for i in range(values.shape[1])] + + if copy: + arrays = [arr.copy() for arr in arrays] return ArrayManager(arrays, [index, columns], verify_integrity=False) From 655b8ad9bf56eed71e5f9264e8c044af41320901 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Nov 2021 17:03:32 +0100 Subject: [PATCH 08/10] remove additional copy --- pandas/core/internals/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index daefdc57117e9..f55591ca52468 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -350,7 +350,7 @@ def ndarray_to_mgr( if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( - maybe_infer_to_datetimelike(values[:, i].copy()) + maybe_infer_to_datetimelike(values[:, i]) ) for i in range(values.shape[1]) ] From 1a69f98c82f074f4994ae3c939bc49eeba78df93 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 25 Nov 2021 20:29:31 +0100 Subject: [PATCH 09/10] feedback --- pandas/core/frame.py | 2 +- pandas/core/internals/construction.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 69336cc8f7069..70cdc3dbd7107 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -612,7 +612,7 @@ def __init__( manager = get_option("mode.data_manager") if copy is None: - if isinstance(data, dict) or data is None: + if isinstance(data, dict): # retain pre-GH#38939 default behavior copy = True elif ( diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f55591ca52468..487d7c1090ec8 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -290,6 +290,10 @@ def ndarray_to_mgr( if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) + # if the array preparation does a copy -> avoid this for ArrayManager, + # since the copy is done on conversion to 1D arrays + copy_on_sanitize = False if typ == "array" else copy + vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype): # GH#19157 @@ -319,7 +323,7 @@ def ndarray_to_mgr( else: # by definition an array here # the dtypes will be coerced to a single dtype - values = _prep_ndarray(values, copy=False if typ == "array" else copy) + values = _prep_ndarray(values, copy=copy_on_sanitize) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape @@ -328,7 +332,6 @@ def ndarray_to_mgr( # GH#40110 see similar check inside sanitize_array rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") - copy_on_sanitize = False if typ == "array" else copy values = sanitize_array( flat, None, dtype=dtype, copy=copy_on_sanitize, raise_cast_failure=rcf ) From a4f76bfcf6f20d0b97aeaf653f232727f09f0950 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Nov 2021 22:52:29 +0100 Subject: [PATCH 10/10] remove unused test arguments --- pandas/tests/frame/methods/test_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index 25f0382696e18..f755b0addfd6d 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -227,7 +227,7 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame): class TestPrivateValues: @td.skip_array_manager_invalid_test - def test_private_values_dt64tz(self, using_array_manager, request): + def test_private_values_dt64tz(self): dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1) df = DataFrame(dta, columns=["A"])