From 692af751df76d4e78a12c191a5e6129cd2733a2a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 16:52:01 +0100 Subject: [PATCH 1/4] ENH: Keep track of references in DataFrame constructor for manager of df --- doc/source/whatsnew/v2.0.0.rst | 3 +++ pandas/core/frame.py | 2 ++ pandas/core/reshape/concat.py | 2 +- pandas/tests/copy_view/test_constructors.py | 27 ++++++++++++++++++++- pandas/tests/frame/test_constructors.py | 4 +-- 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 70c60401f29fb..42105726d3e7e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -238,6 +238,9 @@ Copy-on-Write improvements a modification to the data happens) when constructing a Series from an existing Series with the default of ``copy=False`` (:issue:`50471`) +- The :class:`DataFrame` constructor now will keep track of references when called + with another :class:`DataFrame` or ``BlockManager``. + - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``) will now always raise an exception when Copy-on-Write is enabled. In this mode, chained assignment can never work because we are always setting into a temporary diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2361c254f5161..9781ce3a90f6e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -654,6 +654,8 @@ def __init__( data = data._mgr if isinstance(data, (BlockManager, ArrayManager)): + if using_copy_on_write(): + data = data.copy(deep=False) # first check if a Manager is passed without any other arguments # -> use fastpath (without checking Manager type) if index is None and columns is None and dtype is None and not copy: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 43e0b77a90a85..67eb80cb0dcc8 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -548,7 +548,7 @@ def __init__( # mypy needs to know sample is not an NDFrame sample = cast("DataFrame | Series", sample) obj = sample._constructor(obj, columns=[name], copy=False) - if using_copy_on_write(): + if isinstance(original_obj, ABCSeries) and using_copy_on_write(): # TODO(CoW): Remove when ref tracking in constructors works for i, block in enumerate(original_obj._mgr.blocks): # type: ignore[union-attr] # noqa obj._mgr.blocks[i].refs = block.refs # type: ignore[union-attr] # noqa diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index f5805455e326f..4dff236cbdb09 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,6 +1,12 @@ import numpy as np +import pytest -from pandas import Series +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array # ----------------------------------------------------------------------------- # Copy/view behaviour for Series / DataFrame constructors @@ -73,3 +79,22 @@ def test_series_from_series_with_reindex(using_copy_on_write): assert not np.shares_memory(ser.values, result.values) if using_copy_on_write: assert not result._mgr.blocks[0].refs.has_reference() + + +@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr]) +@pytest.mark.parametrize("columns", [None, ["a"]]) +def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func): + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + + new_df = DataFrame(func(df)) + + assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) + new_df.iloc[0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) + tm.assert_frame_equal(df, df_orig) + else: + assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) + tm.assert_frame_equal(df, new_df) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e009ba45514a2..9db4cb53c2013 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -285,10 +285,8 @@ def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) if using_copy_on_write: - # TODO(CoW) doesn't mutate original should_be_view.iloc[0, 0] = 99 - # assert df.values[0, 0] == 1 - assert df.values[0, 0] == 99 + assert df.values[0, 0] == 1 else: should_be_view[0][0] = 99 assert df.values[0, 0] == 99 From b1f20aab74fee560a50acc96a98c203d5c9aece3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 10 Feb 2023 17:30:41 +0100 Subject: [PATCH 2/4] Update v2.0.0.rst --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 38e0314e5314d..e27a669d4a9b6 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -244,7 +244,7 @@ Copy-on-Write improvements Series with the default of ``copy=False`` (:issue:`50471`) - The :class:`DataFrame` constructor now will keep track of references when called - with another :class:`DataFrame` or ``BlockManager``. + with another :class:`DataFrame`. - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``) will now always raise an exception when Copy-on-Write is enabled. In this mode, From dd40d8cb6c7e78cdb995e853f4a6a436148a4796 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Feb 2023 22:27:15 +0100 Subject: [PATCH 3/4] Adjust whatsnew --- doc/source/whatsnew/v2.0.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 70fe1f6d1d136..685dacf51f970 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -243,8 +243,8 @@ Copy-on-Write improvements a modification to the data happens) when constructing a Series from an existing Series with the default of ``copy=False`` (:issue:`50471`) -- The :class:`DataFrame` constructor now will keep track of references when called - with another :class:`DataFrame`. +- The :class:`DataFrame` constructor now will now respect the Copy-on-Write mechanism + when called with another :class:`DataFrame`. - The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary of Series objects and specifying ``copy=False``, will now use a lazy copy From e090f8a0529c51245f7dc8bb53ffd748b27bcb4b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Feb 2023 22:29:12 +0100 Subject: [PATCH 4/4] Adjust whatsnew --- doc/source/whatsnew/v2.0.0.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 685dacf51f970..29b597bf7a95c 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -243,8 +243,9 @@ Copy-on-Write improvements a modification to the data happens) when constructing a Series from an existing Series with the default of ``copy=False`` (:issue:`50471`) -- The :class:`DataFrame` constructor now will now respect the Copy-on-Write mechanism - when called with another :class:`DataFrame`. +- The :class:`DataFrame` constructor will now create a lazy copy (deferring the copy until + a modification to the data happens) when constructing from an existing + :class:`DataFrame` with the default of ``copy=False`` (:issue:`51239`) - The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary of Series objects and specifying ``copy=False``, will now use a lazy copy