From 3303b7d2a6e98e83259ed8a17e63b50991d0b803 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 27 Nov 2023 22:57:57 +0100 Subject: [PATCH 1/6] CoW: Avoid warning in apply for mixed dtype frame --- pandas/conftest.py | 1 + pandas/core/apply.py | 6 ++++++ pandas/tests/apply/test_invalid_arg.py | 8 ------- pandas/tests/copy_view/test_methods.py | 30 ++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 3205b6657439f..5415d8cf97597 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1920,6 +1920,7 @@ def warn_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = "warn" return ( pd.options.mode.copy_on_write == "warn" and _get_option("mode.data_manager", silent=True) == "block" diff --git a/pandas/core/apply.py b/pandas/core/apply.py index bb3cc3a03760f..3e85ab385c5f5 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -20,6 +20,7 @@ from pandas._config import option_context from pandas._libs import lib +from pandas._libs.internals import BlockValuesRefs from pandas._typing import ( AggFuncType, AggFuncTypeBase, @@ -1254,6 +1255,8 @@ def series_generator(self) -> Generator[Series, None, None]: ser = self.obj._ixs(0, axis=0) mgr = ser._mgr + is_view = mgr.blocks[0].refs.has_reference() + if isinstance(ser.dtype, ExtensionDtype): # values will be incorrect for this block # TODO(EA2D): special case would be unnecessary with 2D EAs @@ -1267,6 +1270,9 @@ def series_generator(self) -> Generator[Series, None, None]: ser._mgr = mgr mgr.set_values(arr) object.__setattr__(ser, "_name", name) + if not is_view: + # You really shouldn't do this... + mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) yield ser @staticmethod diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 9f8611dd4b08b..5b661845e1738 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -18,7 +18,6 @@ DataFrame, Series, date_range, - notna, ) import pandas._testing as tm @@ -150,8 +149,6 @@ def test_transform_axis_1_raises(): Series([1]).transform("sum", axis=1) -# TODO(CoW-warn) should not need to warn -@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_apply_modify_traceback(): data = DataFrame( { @@ -207,11 +204,6 @@ def transform(row): row["D"] = 7 return row - def transform2(row): - if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": - row["D"] = 7 - return row - msg = "'float' object has no attribute 'startswith'" with pytest.raises(AttributeError, match=msg): data.apply(transform, axis=1) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 806842dcab57a..044e69c98ca8b 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -2002,3 +2002,33 @@ def test_eval_inplace(using_copy_on_write, warn_copy_on_write): df.iloc[0, 0] = 100 if using_copy_on_write: tm.assert_frame_equal(df_view, df_orig) + + +def test_apply_warning(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + df_orig = df.copy() + + def transform(row): + row["B"] = 100 + return row + + if using_copy_on_write: + df.apply(transform, axis=1) + tm.assert_frame_equal(df, df_orig) + elif warn_copy_on_write: + with tm.assert_produces_warning(FutureWarning, match="Setting a value"): + df.apply(transform, axis=1) + + df = DataFrame({"A": [1, 2], "B": ["b", "c"]}) + df_orig = df.copy() + + def transform(row): + row["A"] = 100 + return row + + if using_copy_on_write: + df.apply(transform, axis=1) + tm.assert_frame_equal(df, df_orig) + elif warn_copy_on_write: + with tm.assert_produces_warning(None): + df.apply(transform, axis=1) From 2e579ba1361e05cd2ad75be9a4c3a5b43ff30ccc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 27 Nov 2023 22:58:24 +0100 Subject: [PATCH 2/6] CoW: Avoid warning in apply for mixed dtype frame --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 5415d8cf97597..3205b6657439f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1920,7 +1920,6 @@ def warn_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = "warn" return ( pd.options.mode.copy_on_write == "warn" and _get_option("mode.data_manager", silent=True) == "block" From 9560245fb5bdf8c8f5181bb249f9f491a4a488e2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 27 Nov 2023 23:42:28 +0100 Subject: [PATCH 3/6] CoW: Avoid warning in apply for mixed dtype frame --- pandas/core/apply.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 3e85ab385c5f5..adaaeb3f3b767 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1255,7 +1255,7 @@ def series_generator(self) -> Generator[Series, None, None]: ser = self.obj._ixs(0, axis=0) mgr = ser._mgr - is_view = mgr.blocks[0].refs.has_reference() + is_view = mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] if isinstance(ser.dtype, ExtensionDtype): # values will be incorrect for this block @@ -1272,7 +1272,7 @@ def series_generator(self) -> Generator[Series, None, None]: object.__setattr__(ser, "_name", name) if not is_view: # You really shouldn't do this... - mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) + mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) # type: ignore[union-attr] yield ser @staticmethod From 82bf5cb7c0c113393499efa19c0d2613fbdfc775 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 28 Nov 2023 00:20:11 +0100 Subject: [PATCH 4/6] Update --- pandas/tests/copy_view/test_methods.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 044e69c98ca8b..698e4f7dbecca 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -2022,10 +2022,6 @@ def transform(row): df = DataFrame({"A": [1, 2], "B": ["b", "c"]}) df_orig = df.copy() - def transform(row): - row["A"] = 100 - return row - if using_copy_on_write: df.apply(transform, axis=1) tm.assert_frame_equal(df, df_orig) From 5b8d0087dbe0abb67dd281885406d130a603b67f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 28 Nov 2023 20:52:31 +0100 Subject: [PATCH 5/6] Fixup --- pandas/tests/apply/test_invalid_arg.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 051eaa5ef019d..5b661845e1738 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -206,8 +206,7 @@ def transform(row): msg = "'float' object has no attribute 'startswith'" with pytest.raises(AttributeError, match=msg): - with tm.assert_cow_warning(warn_copy_on_write): - data.apply(transform, axis=1) + data.apply(transform, axis=1) @pytest.mark.parametrize( From 5065b780c33fe383060e065b5b267c52d9f41b23 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Dec 2023 15:49:15 +0100 Subject: [PATCH 6/6] add more comments + rename test --- pandas/core/apply.py | 7 ++++++- pandas/core/internals/managers.py | 8 ++++---- pandas/tests/copy_view/test_methods.py | 22 ++++++++++++---------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index adaaeb3f3b767..169a44accf066 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1271,7 +1271,12 @@ def series_generator(self) -> Generator[Series, None, None]: mgr.set_values(arr) object.__setattr__(ser, "_name", name) if not is_view: - # You really shouldn't do this... + # In apply_series_generator we store the a shallow copy of the + # result, which potentially increases the ref count of this reused + # `ser` object (depending on the result of the applied function) + # -> if that happened and `ser` is already a copy, then we reset + # the refs here to avoid triggering a unnecessary CoW inside the + # applied function (https://github.com/pandas-dev/pandas/pull/56212) mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) # type: ignore[union-attr] yield ser diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a02f31d4483b2..9d2a1e634eea2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2071,11 +2071,11 @@ def set_values(self, values: ArrayLike) -> None: Set the values of the single block in place. Use at your own risk! This does not check if the passed values are - valid for the current Block/SingleBlockManager (length, dtype, etc). + valid for the current Block/SingleBlockManager (length, dtype, etc), + and this does not properly keep track of references. """ - # TODO(CoW) do we need to handle copy on write here? Currently this is - # only used for FrameColumnApply.series_generator (what if apply is - # mutating inplace?) + # NOTE(CoW) Currently this is only used for FrameColumnApply.series_generator + # which handles CoW by setting the refs manually if necessary self.blocks[0].values = values self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index f8f1d8d723255..84b2c59bda5d9 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -2015,7 +2015,9 @@ def test_eval_inplace(using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(df_view, df_orig) -def test_apply_warning(using_copy_on_write, warn_copy_on_write): +def test_apply_modify_row(using_copy_on_write, warn_copy_on_write): + # Case: applying a function on each row as a Series object, where the + # function mutates the row object (which needs to trigger CoW if row is a view) df = DataFrame({"A": [1, 2], "B": [3, 4]}) df_orig = df.copy() @@ -2023,19 +2025,19 @@ def transform(row): row["B"] = 100 return row - if using_copy_on_write: + with tm.assert_cow_warning(warn_copy_on_write): df.apply(transform, axis=1) + + if using_copy_on_write: tm.assert_frame_equal(df, df_orig) - elif warn_copy_on_write: - with tm.assert_produces_warning(FutureWarning, match="Setting a value"): - df.apply(transform, axis=1) + else: + assert df.loc[0, "B"] == 100 + # row Series is a copy df = DataFrame({"A": [1, 2], "B": ["b", "c"]}) df_orig = df.copy() - if using_copy_on_write: + with tm.assert_produces_warning(None): df.apply(transform, axis=1) - tm.assert_frame_equal(df, df_orig) - elif warn_copy_on_write: - with tm.assert_produces_warning(None): - df.apply(transform, axis=1) + + tm.assert_frame_equal(df, df_orig)