From 2cef7995f7a657378b68b09d0dbd59f908d52987 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 22:31:00 +0100 Subject: [PATCH 01/12] ENH: Implement CoW for interpolate --- pandas/core/internals/blocks.py | 48 +++++++++++++++++++++++-------- pandas/core/internals/managers.py | 11 ++----- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 115ae5dc6bb9d..a37570da75d13 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -225,7 +225,10 @@ def make_block( @final def make_block_same_class( - self, values, placement: BlockPlacement | None = None + self, + values, + placement: BlockPlacement | None = None, + refs: BlockValuesRefs | None = None, ) -> Block: """Wrap given values in a block of same type as self.""" # Pre-2.0 we called ensure_wrapped_if_datetimelike because fastparquet @@ -234,7 +237,7 @@ def make_block_same_class( placement = self._mgr_locs # We assume maybe_coerce_values has already been called - return type(self)(values, placement=placement, ndim=self.ndim) + return type(self)(values, placement=placement, ndim=self.ndim, refs=refs) @final def __repr__(self) -> str: @@ -418,7 +421,9 @@ def coerce_to_target_dtype(self, other) -> Block: return self.astype(new_dtype, copy=False) @final - def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]: + def _maybe_downcast( + self, blocks: list[Block], downcast=None, using_cow: bool = False + ) -> list[Block]: if downcast is False: return blocks @@ -433,18 +438,19 @@ def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]: if downcast is None: return blocks - return extend_blocks([b._downcast_2d(downcast) for b in blocks]) + return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) @final @maybe_split - def _downcast_2d(self, dtype) -> list[Block]: + def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]: """ downcast specialized to 2D case post-validation. Refactored to allow use of maybe_split. """ new_values = maybe_downcast_to_dtype(self.values, dtype=dtype) - return [self.make_block(new_values)] + refs = self.refs if using_cow and new_values is self.values else None + return [self.make_block(new_values, refs=refs)] def convert( self, @@ -1195,6 +1201,7 @@ def interpolate( limit_area: str | None = None, fill_value: Any | None = None, downcast: str | None = None, + using_cow: bool = False, **kwargs, ) -> list[Block]: @@ -1202,6 +1209,8 @@ def interpolate( if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op + if using_cow: + return [self.copy(deep=False)] return [self] if inplace else [self.copy()] try: @@ -1210,8 +1219,10 @@ def interpolate( m = None if m is None and self.dtype.kind != "f": # only deal with floats - # bc we already checked that can_hold_na, we dont have int dtype here + # bc we already checked that can_hold_na, we don't have int dtype here # test_interp_basic checks that we make a copy here + if using_cow: + return [self.copy(deep=False)] return [self] if inplace else [self.copy()] if self.is_object and self.ndim == 2 and self.shape[0] != 1 and axis == 0: @@ -1230,7 +1241,15 @@ def interpolate( **kwargs, ) - data = self.values if inplace else self.values.copy() + refs = None + if inplace: + if using_cow and self.refs.has_reference(): + data = self.values.copy() + else: + data = self.values + refs = self.refs + else: + data = self.values.copy() data = cast(np.ndarray, data) # bc overridden by ExtensionBlock missing.interpolate_array_2d( @@ -1245,8 +1264,8 @@ def interpolate( **kwargs, ) - nb = self.make_block_same_class(data) - return nb._maybe_downcast([nb], downcast) + nb = self.make_block_same_class(data, refs=refs) + return nb._maybe_downcast([nb], downcast, using_cow) def diff(self, n: int, axis: AxisInt = 1) -> list[Block]: """return block for the diff of the values""" @@ -1618,6 +1637,7 @@ def interpolate( inplace: bool = False, limit: int | None = None, fill_value=None, + using_cow: bool = False, **kwargs, ): values = self.values @@ -1997,6 +2017,7 @@ def interpolate( inplace: bool = False, limit: int | None = None, fill_value=None, + using_cow: bool = False, **kwargs, ): values = self.values @@ -2006,12 +2027,15 @@ def interpolate( # "Literal['linear']") [comparison-overlap] if method == "linear": # type: ignore[comparison-overlap] # TODO: GH#50950 implement for arbitrary EAs - data_out = values._ndarray if inplace else values._ndarray.copy() + if using_cow: + data_out = values._ndarray + else: + data_out = values._ndarray if inplace else values._ndarray.copy() missing.interpolate_array_2d( data_out, method=method, limit=limit, index=index, axis=axis ) new_values = type(values)._simple_new(data_out, dtype=values.dtype) - return self.make_block_same_class(new_values) + return self.make_block_same_class(new_values, refs=self.refs) elif values.ndim == 2 and axis == 0: # NDArrayBackedExtensionArray.fillna assumes axis=1 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5d45b33871900..58918ed162c7c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -389,14 +389,9 @@ def diff(self: T, n: int, axis: AxisInt) -> T: return self.apply("diff", n=n, axis=axis) def interpolate(self: T, inplace: bool, **kwargs) -> T: - if inplace: - # TODO(CoW) can be optimized to only copy those blocks that have refs - if using_copy_on_write() and any( - not self._has_no_reference_block(i) for i in range(len(self.blocks)) - ): - self = self.copy() - - return self.apply("interpolate", inplace=inplace, **kwargs) + return self.apply( + "interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write() + ) def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T: axis = self._normalize_axis(axis) From 26f39726d16eac21d0f629ec7f68df3d444af8d4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 23:03:20 +0100 Subject: [PATCH 02/12] Fix and add first test --- pandas/conftest.py | 1 + pandas/tests/copy_view/test_methods.py | 19 ++++++++++++++++ .../tests/frame/methods/test_interpolate.py | 22 ++++++++----------- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2c410bb98b506..2a5c0395dfb4e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,6 +1944,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 6b54345723118..34da1e2dfaa30 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1252,6 +1252,25 @@ def test_putmask(using_copy_on_write): assert view.iloc[0, 0] == 5 +@pytest.mark.parametrize("method", ["pad", "nearest", "linear"]) +def test_interpolate_no_op(using_copy_on_write, method): + df = DataFrame({"a": [1, 2]}) + df_orig = df.copy() + + result = df.interpolate(method=method) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + def test_asfreq_noop(using_copy_on_write): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]}, diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 20e4571e2ba05..4540a37cafb4a 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -95,14 +95,6 @@ def test_interp_basic_with_non_range_index(self): "D": list("abcd"), } ) - expected = DataFrame( - { - "A": [1.0, 2.0, 3.0, 4.0], - "B": [1.0, 4.0, 9.0, 9.0], - "C": [1, 2, 3, 5], - "D": list("abcd"), - } - ) result = df.set_index("C").interpolate() expected = df.set_index("C") @@ -327,20 +319,24 @@ def test_interp_raise_on_all_object_dtype(self): df.interpolate() def test_interp_inplace(self, using_copy_on_write): - # TODO(CoW) inplace keyword (it is still mutating the parent) - if using_copy_on_write: - pytest.skip("CoW: inplace keyword not yet handled") df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) + expected_cow = df.copy() result = df.copy() return_value = result["a"].interpolate(inplace=True) assert return_value is None - tm.assert_frame_equal(result, expected) + if using_copy_on_write: + tm.assert_frame_equal(result, expected) + else: + tm.assert_frame_equal(result, expected_cow) result = df.copy() return_value = result["a"].interpolate(inplace=True, downcast="infer") assert return_value is None - tm.assert_frame_equal(result, expected.astype("int64")) + if using_copy_on_write: + tm.assert_frame_equal(result, expected.astype("int64")) + else: + tm.assert_frame_equal(result, expected_cow) def test_interp_inplace_row(self): # GH 10395 From 27648c6d0eda182aa6fc542fcbb8388b4a0d357e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 23:32:04 +0100 Subject: [PATCH 03/12] Add tests --- pandas/core/internals/blocks.py | 9 ++- pandas/tests/copy_view/test_methods.py | 89 ++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a37570da75d13..99cb01d6d9579 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2027,15 +2027,20 @@ def interpolate( # "Literal['linear']") [comparison-overlap] if method == "linear": # type: ignore[comparison-overlap] # TODO: GH#50950 implement for arbitrary EAs + refs = None if using_cow: - data_out = values._ndarray + if inplace and not self.refs.has_reference(): + data_out = values._ndarray + refs = self.refs + else: + data_out = values._ndarray.copy() else: data_out = values._ndarray if inplace else values._ndarray.copy() missing.interpolate_array_2d( data_out, method=method, limit=limit, index=index, axis=axis ) new_values = type(values)._simple_new(data_out, dtype=values.dtype) - return self.make_block_same_class(new_values, refs=self.refs) + return self.make_block_same_class(new_values, refs=refs) elif values.ndim == 2 and axis == 0: # NDArrayBackedExtensionArray.fillna assumes axis=1 diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 34da1e2dfaa30..f30e78e17be2f 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -5,6 +5,7 @@ DataFrame, Index, MultiIndex, + NaT, Period, Series, Timestamp, @@ -1271,6 +1272,94 @@ def test_interpolate_no_op(using_copy_on_write, method): tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize("func", ["pad", "ffill", "backfill", "bfill"]) +def test_fill_functions(using_copy_on_write, func): + # Check that these takes the same code paths as interpolate + df = DataFrame({"a": [1, 2]}) + df_orig = df.copy() + + result = getattr(df, func)() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("func", ["pad", "ffill", "backfill", "bfill"]) +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_triggers_copy(using_copy_on_write, vals, func): + df = DataFrame({"a": vals}) + result = getattr(df, func)() + + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + if using_copy_on_write: + # Check that we don't have references when triggering a copy + assert result._mgr._has_no_reference(0) + + +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_inplace_no_reference(using_copy_on_write, vals): + df = DataFrame({"a": vals}) + arr = get_array(df, "a") + df.interpolate(method="linear", inplace=True) + + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + # Check that we don't have references when triggering a copy + assert df._mgr._has_no_reference(0) + + +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_inplace_with_refs(using_copy_on_write, vals): + df = DataFrame({"a": [1, np.nan, 2]}) + df_orig = df.copy() + arr = get_array(df, "a") + view = df[:] + df.interpolate(method="linear", inplace=True) + + if using_copy_on_write: + # Check that copy was triggered in interpolate and that we don't + # have any references left + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_interpolate_cleaned_fill_method(using_copy_on_write): + # Check that "method is set to None" case works correctly + df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) + df_orig = df.copy() + + result = df.interpolate(method="asfreq") + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = Timestamp("2021-12-31") + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + def test_asfreq_noop(using_copy_on_write): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]}, From 80c95deb6bfb490605b49110f127f6d014c85789 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 23:32:35 +0100 Subject: [PATCH 04/12] Add tests --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2a5c0395dfb4e..2c410bb98b506 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,7 +1944,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" From d13216120bdf2a8a5e15c20ea7c554cc70b51d9b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 22:53:05 +0100 Subject: [PATCH 05/12] ENH: Add CoW optimization to interpolate --- doc/source/whatsnew/v2.0.0.rst | 3 + pandas/core/internals/blocks.py | 5 +- pandas/tests/copy_view/test_interp_fillna.py | 164 +++++++++++++++++++ pandas/tests/copy_view/test_methods.py | 108 ------------ 4 files changed, 171 insertions(+), 109 deletions(-) create mode 100644 pandas/tests/copy_view/test_interp_fillna.py diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 532881fee0892..315897b1a5025 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -223,6 +223,9 @@ Copy-on-Write improvements - :meth:`DataFrame.to_period` / :meth:`Series.to_period` - :meth:`DataFrame.truncate` - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` + - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate` + - :meth:`DataFrame.ffill` / :meth:`Series.ffill` + - :meth:`DataFrame.bfill` / :meth:`Series.bfill` - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - :func:`concat` diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 99cb01d6d9579..1944561827f97 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -433,7 +433,10 @@ def _maybe_downcast( # but ATM it breaks too much existing code. # split and convert the blocks - return extend_blocks([blk.convert() for blk in blocks]) + copy = True if not using_cow else False + return extend_blocks( + [blk.convert(using_cow=using_cow, copy=copy) for blk in blocks] + ) if downcast is None: return blocks diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py new file mode 100644 index 0000000000000..5e92d341dc6b4 --- /dev/null +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -0,0 +1,164 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + NaT, + Series, + Timestamp, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +@pytest.mark.parametrize("method", ["pad", "nearest", "linear"]) +def test_interpolate_no_op(using_copy_on_write, method): + df = DataFrame({"a": [1, 2]}) + df_orig = df.copy() + + result = df.interpolate(method=method) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("func", ["pad", "ffill", "backfill", "bfill"]) +def test_interp_fill_functions(using_copy_on_write, func): + # Check that these takes the same code paths as interpolate + df = DataFrame({"a": [1, 2]}) + df_orig = df.copy() + + result = getattr(df, func)() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_triggers_copy(using_copy_on_write, vals, func): + df = DataFrame({"a": vals}) + result = getattr(df, func)() + + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + if using_copy_on_write: + # Check that we don't have references when triggering a copy + assert result._mgr._has_no_reference(0) + + +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_inplace_no_reference_no_copy(using_copy_on_write, vals): + df = DataFrame({"a": vals}) + arr = get_array(df, "a") + df.interpolate(method="linear", inplace=True) + + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + # Check that we don't have references when triggering a copy + assert df._mgr._has_no_reference(0) + + +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_inplace_with_refs(using_copy_on_write, vals): + df = DataFrame({"a": [1, np.nan, 2]}) + df_orig = df.copy() + arr = get_array(df, "a") + view = df[:] + df.interpolate(method="linear", inplace=True) + + if using_copy_on_write: + # Check that copy was triggered in interpolate and that we don't + # have any references left + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_interpolate_cleaned_fill_method(using_copy_on_write): + # Check that "method is set to None" case works correctly + df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) + df_orig = df.copy() + + result = df.interpolate(method="asfreq") + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = Timestamp("2021-12-31") + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_interpolate_object_convert_no_op(using_copy_on_write): + # Check that "method is set to None" case works correctly + df = DataFrame({"a": ["a", "b", "c"], "b": 1}) + arr_a = get_array(df, "a") + df.interpolate(method="pad", inplace=True) + + # No CoW makes a copy, it should not! + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr_a, get_array(df, "a")) + + +def test_interpolate_object_convert_copies(using_copy_on_write): + df = DataFrame({"a": Series([1, 2], dtype=object), "b": 1}) + arr_a = get_array(df, "a") + df.interpolate(method="pad", inplace=True) + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert not np.shares_memory(arr_a, get_array(df, "a")) + + +def test_interpolate_downcast(using_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2.5], "b": 1}) + arr_a = get_array(df, "a") + df.interpolate(method="pad", inplace=True, downcast="infer") + + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr_a, get_array(df, "a")) + + +def test_interpolate_downcast_reference_triggers_copy(using_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2.5], "b": 1}) + df_orig = df.copy() + arr_a = get_array(df, "a") + view = df[:] + df.interpolate(method="pad", inplace=True, downcast="infer") + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert not np.shares_memory(arr_a, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + else: + tm.assert_frame_equal(df, view) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index f30e78e17be2f..6b54345723118 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -5,7 +5,6 @@ DataFrame, Index, MultiIndex, - NaT, Period, Series, Timestamp, @@ -1253,113 +1252,6 @@ def test_putmask(using_copy_on_write): assert view.iloc[0, 0] == 5 -@pytest.mark.parametrize("method", ["pad", "nearest", "linear"]) -def test_interpolate_no_op(using_copy_on_write, method): - df = DataFrame({"a": [1, 2]}) - df_orig = df.copy() - - result = df.interpolate(method=method) - - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - - result.iloc[0, 0] = 100 - - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) - - -@pytest.mark.parametrize("func", ["pad", "ffill", "backfill", "bfill"]) -def test_fill_functions(using_copy_on_write, func): - # Check that these takes the same code paths as interpolate - df = DataFrame({"a": [1, 2]}) - df_orig = df.copy() - - result = getattr(df, func)() - - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - - result.iloc[0, 0] = 100 - - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) - - -@pytest.mark.parametrize("func", ["pad", "ffill", "backfill", "bfill"]) -@pytest.mark.parametrize( - "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] -) -def test_interpolate_triggers_copy(using_copy_on_write, vals, func): - df = DataFrame({"a": vals}) - result = getattr(df, func)() - - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - if using_copy_on_write: - # Check that we don't have references when triggering a copy - assert result._mgr._has_no_reference(0) - - -@pytest.mark.parametrize( - "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] -) -def test_interpolate_inplace_no_reference(using_copy_on_write, vals): - df = DataFrame({"a": vals}) - arr = get_array(df, "a") - df.interpolate(method="linear", inplace=True) - - assert np.shares_memory(arr, get_array(df, "a")) - if using_copy_on_write: - # Check that we don't have references when triggering a copy - assert df._mgr._has_no_reference(0) - - -@pytest.mark.parametrize( - "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] -) -def test_interpolate_inplace_with_refs(using_copy_on_write, vals): - df = DataFrame({"a": [1, np.nan, 2]}) - df_orig = df.copy() - arr = get_array(df, "a") - view = df[:] - df.interpolate(method="linear", inplace=True) - - if using_copy_on_write: - # Check that copy was triggered in interpolate and that we don't - # have any references left - assert not np.shares_memory(arr, get_array(df, "a")) - tm.assert_frame_equal(df_orig, view) - assert df._mgr._has_no_reference(0) - assert view._mgr._has_no_reference(0) - else: - assert np.shares_memory(arr, get_array(df, "a")) - - -def test_interpolate_cleaned_fill_method(using_copy_on_write): - # Check that "method is set to None" case works correctly - df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) - df_orig = df.copy() - - result = df.interpolate(method="asfreq") - - if using_copy_on_write: - assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - - result.iloc[0, 0] = Timestamp("2021-12-31") - - if using_copy_on_write: - assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) - - def test_asfreq_noop(using_copy_on_write): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]}, From 644af0c83c961417e2a2b53d8cecbcefa790ad2a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 22:58:28 +0100 Subject: [PATCH 06/12] Remove comment --- pandas/tests/copy_view/test_interp_fillna.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 5e92d341dc6b4..a6f7270f3f283 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -119,7 +119,6 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): def test_interpolate_object_convert_no_op(using_copy_on_write): - # Check that "method is set to None" case works correctly df = DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") df.interpolate(method="pad", inplace=True) From 425f8f44d74536a4caca621184eb48cb3f40ada3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 23:39:34 +0100 Subject: [PATCH 07/12] Fix --- pandas/conftest.py | 1 + pandas/tests/frame/methods/test_interpolate.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2c410bb98b506..2a5c0395dfb4e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,6 +1944,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 4540a37cafb4a..a9360b6f15a7c 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -326,17 +326,17 @@ def test_interp_inplace(self, using_copy_on_write): return_value = result["a"].interpolate(inplace=True) assert return_value is None if using_copy_on_write: - tm.assert_frame_equal(result, expected) - else: tm.assert_frame_equal(result, expected_cow) + else: + tm.assert_frame_equal(result, expected) result = df.copy() return_value = result["a"].interpolate(inplace=True, downcast="infer") assert return_value is None if using_copy_on_write: - tm.assert_frame_equal(result, expected.astype("int64")) - else: tm.assert_frame_equal(result, expected_cow) + else: + tm.assert_frame_equal(result, expected.astype("int64")) def test_interp_inplace_row(self): # GH 10395 From 3119c8412dc26f50936c3df22c899a3bfad990c3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 23:39:52 +0100 Subject: [PATCH 08/12] Fix --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2a5c0395dfb4e..2c410bb98b506 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,7 +1944,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" From 58bffc9487ad54adf5a1b33ffa044e0984c8f0b8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 09:48:39 +0100 Subject: [PATCH 09/12] Fix tests --- pandas/tests/copy_view/test_interp_fillna.py | 5 +++-- pandas/tests/frame/methods/test_interpolate.py | 10 +++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index a6f7270f3f283..53cce8699acd0 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -30,7 +30,7 @@ def test_interpolate_no_op(using_copy_on_write, method): tm.assert_frame_equal(df, df_orig) -@pytest.mark.parametrize("func", ["pad", "ffill", "backfill", "bfill"]) +@pytest.mark.parametrize("func", ["ffill", "bfill"]) def test_interp_fill_functions(using_copy_on_write, func): # Check that these takes the same code paths as interpolate df = DataFrame({"a": [1, 2]}) @@ -144,7 +144,8 @@ def test_interpolate_downcast(using_copy_on_write): arr_a = get_array(df, "a") df.interpolate(method="pad", inplace=True, downcast="infer") - assert df._mgr._has_no_reference(0) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) assert np.shares_memory(arr_a, get_array(df, "a")) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index a9360b6f15a7c..d6da967106fe6 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -52,7 +52,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 - def test_interp_basic(self): + def test_interp_basic(self, using_copy_on_write): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -75,8 +75,12 @@ def test_interp_basic(self): # check we didn't operate inplace GH#45791 cvalues = df["C"]._values dvalues = df["D"].values - assert not np.shares_memory(cvalues, result["C"]._values) - assert not np.shares_memory(dvalues, result["D"]._values) + if using_copy_on_write: + assert np.shares_memory(cvalues, result["C"]._values) + assert np.shares_memory(dvalues, result["D"]._values) + else: + assert not np.shares_memory(cvalues, result["C"]._values) + assert not np.shares_memory(dvalues, result["D"]._values) res = df.interpolate(inplace=True) assert res is None From 254de7d73d40fe4a9321bb0abae29ed9264135ce Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 12:05:28 +0100 Subject: [PATCH 10/12] Simplify --- pandas/conftest.py | 1 + pandas/core/internals/blocks.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2c410bb98b506..2a5c0395dfb4e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,6 +1944,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1944561827f97..a47be433df75f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -433,9 +433,8 @@ def _maybe_downcast( # but ATM it breaks too much existing code. # split and convert the blocks - copy = True if not using_cow else False return extend_blocks( - [blk.convert(using_cow=using_cow, copy=copy) for blk in blocks] + [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] ) if downcast is None: From 30ac5f3d737012cad5eeb6f03b125853fb8efcd6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 12:05:37 +0100 Subject: [PATCH 11/12] Simplify --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2a5c0395dfb4e..2c410bb98b506 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,7 +1944,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" From 64b40cea80f470408fe809a7783f46ffc4308c71 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 10 Feb 2023 11:27:00 +0100 Subject: [PATCH 12/12] Update pandas/tests/copy_view/test_interp_fillna.py Co-authored-by: Joris Van den Bossche --- pandas/tests/copy_view/test_interp_fillna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 53cce8699acd0..1bfcf7d180e54 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -123,7 +123,7 @@ def test_interpolate_object_convert_no_op(using_copy_on_write): arr_a = get_array(df, "a") df.interpolate(method="pad", inplace=True) - # No CoW makes a copy, it should not! + # Now CoW makes a copy, it should not! if using_copy_on_write: assert df._mgr._has_no_reference(0) assert np.shares_memory(arr_a, get_array(df, "a"))