From 9b461250a7778057df86851bf892de92f68f09de Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 20 Jan 2023 15:57:47 +0100 Subject: [PATCH 1/8] ENH: Optimize replace to avoid copying when not necessary --- pandas/core/internals/blocks.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4bb4882574228..2103037ac846f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -11,9 +11,12 @@ cast, final, ) +import weakref import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import ( Timestamp, internals as libinternals, @@ -549,14 +552,22 @@ def replace( # replacing it is a no-op. # Note: If to_replace were a list, NDFrame.replace would call # replace_list instead of replace. - return [self] if inplace else [self.copy()] + if using_copy_on_write(): + result = self.copy(deep=False) + result._ref = weakref.ref(self) + else: + return [self] if inplace else [self.copy()] if mask is None: mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. - return [self] if inplace else [self.copy()] + if using_copy_on_write(): + result = self.copy(deep=False) + result._ref = weakref.ref(self) + else: + return [self] if inplace else [self.copy()] elif self._can_hold_element(value): blk = self if inplace else self.copy() From 773b6ea01b92238bb45997f5eca1a1e02e676c77 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 20 Jan 2023 19:09:04 +0000 Subject: [PATCH 2/8] ENH: Optimize replace to avoid copying when not necessary --- pandas/core/internals/blocks.py | 68 +++++++++++++++++++++----- pandas/core/internals/managers.py | 35 +++++++++++-- pandas/tests/copy_view/test_methods.py | 65 ++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2103037ac846f..20255ef38b973 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -15,8 +15,6 @@ import numpy as np -from pandas._config import using_copy_on_write - from pandas._libs import ( Timestamp, internals as libinternals, @@ -526,6 +524,8 @@ def replace( inplace: bool = False, # mask may be pre-computed if we're called from replace_list mask: npt.NDArray[np.bool_] | None = None, + original_blocks: list[Block] = [], + using_copy_on_write: bool = False, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -552,9 +552,12 @@ def replace( # replacing it is a no-op. # Note: If to_replace were a list, NDFrame.replace would call # replace_list instead of replace. - if using_copy_on_write(): + if using_copy_on_write and original_blocks: result = self.copy(deep=False) - result._ref = weakref.ref(self) + result._ref = result._ref = weakref.ref( # type: ignore[attr-defined] + original_blocks[self.mgr_locs.as_array[0]] + ) + return [result] else: return [self] if inplace else [self.copy()] @@ -563,14 +566,26 @@ def replace( if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. - if using_copy_on_write(): + if using_copy_on_write and original_blocks: result = self.copy(deep=False) - result._ref = weakref.ref(self) + result._ref = result._ref = weakref.ref( # type: ignore[attr-defined] + original_blocks[self.mgr_locs.as_array[0]] + ) + return [result] else: return [self] if inplace else [self.copy()] elif self._can_hold_element(value): - blk = self if inplace else self.copy() + # TODO(CoW): Maybe split here as well into columns where mask has True + # and rest? + if using_copy_on_write: + if original_blocks: + blk = self.copy() + else: + # In case we made a copy before, e.g. coerce to target dtype + blk = self + else: + blk = self if inplace else self.copy() putmask_inplace(blk.values, mask, value) if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise @@ -595,6 +610,13 @@ def replace( else: # split so that we only upcast where necessary blocks = [] + if original_blocks: + _original_blocks = [original_blocks[self.mgr_locs.as_array[0]]] * len( + self + ) + else: + _original_blocks = [] + for i, nb in enumerate(self._split()): blocks.extend( type(self).replace( @@ -603,6 +625,8 @@ def replace( value=value, inplace=True, mask=mask[i : i + 1], + original_blocks=[self] * (self.mgr_locs.as_array.max() + 1), + using_copy_on_write=using_copy_on_write, ) ) return blocks @@ -656,6 +680,8 @@ def replace_list( dest_list: Sequence[Any], inplace: bool = False, regex: bool = False, + original_blocks: list[Block] = [], + using_copy_on_write: bool = False, ) -> list[Block]: """ See BlockManager.replace_list docstring. @@ -668,7 +694,14 @@ def replace_list( ] if not len(pairs): # shortcut, nothing to replace - return [self] if inplace else [self.copy()] + if using_copy_on_write and original_blocks: + result = self.copy(deep=False) + result._ref = result._ref = weakref.ref( # type: ignore[attr-defined] + original_blocks[self.mgr_locs.as_array[0]] + ) + return [result] + else: + return [self] if inplace else [self.copy()] src_len = len(pairs) - 1 @@ -689,7 +722,11 @@ def replace_list( # ndarray]" masks = [extract_bool_array(x) for x in masks] # type: ignore[arg-type] - rb = [self if inplace else self.copy()] + if using_copy_on_write: + # TODO(CoW): Optimize to avoid as many copies as possible + rb = [self.copy()] + else: + rb = [self if inplace else self.copy()] for i, (src, dest) in enumerate(pairs): convert = i == src_len # only convert once at the end new_rb: list[Block] = [] @@ -712,8 +749,10 @@ def replace_list( to_replace=src, value=dest, mask=m, # type: ignore[arg-type] - inplace=inplace, + inplace=True, # We already made a copy if inplace=False regex=regex, + # TODO(CoW): Optimize to avoid as many copies as possible + using_copy_on_write=False, ) if convert and blk.is_object and not all(x is None for x in dest_list): # GH#44498 avoid unwanted cast-back @@ -730,6 +769,8 @@ def _replace_coerce( mask: npt.NDArray[np.bool_], inplace: bool = True, regex: bool = False, + original_blocks: list[Block] = [], + using_copy_on_write: bool = False, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -771,7 +812,12 @@ def _replace_coerce( return [nb] return [self] if inplace else [self.copy()] return self.replace( - to_replace=to_replace, value=value, inplace=inplace, mask=mask + to_replace=to_replace, + value=value, + inplace=inplace, + mask=mask, + original_blocks=original_blocks, + using_copy_on_write=using_copy_on_write, ) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1a0aba0778da5..8ec53b1e45565 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -449,10 +449,36 @@ def replace(self: T, to_replace, value, inplace: bool) -> T: # NDFrame.replace ensures the not-is_list_likes here assert not is_list_like(to_replace) assert not is_list_like(value) - return self.apply( - "replace", to_replace=to_replace, value=value, inplace=inplace + return self._call_function_and_update_refs( + "replace", + to_replace=to_replace, + value=value, + inplace=inplace, ) + def _call_function_and_update_refs(self, func, **kwargs): + if using_copy_on_write(): + use_cow = True + if self.is_single_block: + original_blocks = [self.blocks[0]] * self.shape[0] + else: + original_blocks = [self.blocks[i] for i in self.blknos] + else: + use_cow = False + original_blocks = [] + + mgr = self.apply( + func, + **kwargs, + original_blocks=original_blocks, + using_copy_on_write=use_cow, + ) + refs = [getattr(blk, "_ref", None) for blk in mgr.blocks] + if any(ref is not None for ref in refs): + mgr.refs = refs + mgr.parent = self + return mgr + def replace_regex(self, **kwargs): return self.apply("_replace_regex", **kwargs) @@ -466,14 +492,15 @@ def replace_list( """do a list replace""" inplace = validate_bool_kwarg(inplace, "inplace") - bm = self.apply( + bm = self._call_function_and_update_refs( "replace_list", src_list=src_list, dest_list=dest_list, inplace=inplace, regex=regex, ) - bm._consolidate_inplace() + if not using_copy_on_write(): + bm._consolidate_inplace() return bm def to_native_types(self: T, **kwargs) -> T: diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 8fd9d5c5126c1..4967e757b0b9d 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1029,6 +1029,71 @@ def test_replace(using_copy_on_write, replace_kwargs): tm.assert_frame_equal(df, df_orig) +def test_replace_mask_all_false_second_block(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2}) + df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value=55.5) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "c"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + # TODO: This should split and not copy the whole block + # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) + + +def test_replace_coerce_single_column(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) + df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value="a") + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + if using_copy_on_write: + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + +@pytest.mark.parametrize("to_replace", ["xxx", ["xxx"]]) +def test_replace_to_replace_wrong_dtype(using_copy_on_write, to_replace): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) + df_orig = df.copy() + + df2 = df.replace(to_replace=to_replace, value=1.5) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + def test_putmask(using_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1, "c": 2}) view = df[:] From e8f2c3f71407c2c91c0d5b194df4518777ec1e4c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 20 Jan 2023 20:36:08 +0000 Subject: [PATCH 3/8] Fix mypy --- pandas/core/internals/blocks.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 20255ef38b973..767d91e2be52b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -153,6 +153,7 @@ class Block(PandasObject): is_extension = False _can_consolidate = True _validate_ndim = True + _ref = None @final @cache_readonly @@ -554,7 +555,7 @@ def replace( # replace_list instead of replace. if using_copy_on_write and original_blocks: result = self.copy(deep=False) - result._ref = result._ref = weakref.ref( # type: ignore[attr-defined] + result._ref = result._ref = weakref.ref( original_blocks[self.mgr_locs.as_array[0]] ) return [result] @@ -568,7 +569,7 @@ def replace( # bc _can_hold_element is incorrect. if using_copy_on_write and original_blocks: result = self.copy(deep=False) - result._ref = result._ref = weakref.ref( # type: ignore[attr-defined] + result._ref = result._ref = weakref.ref( original_blocks[self.mgr_locs.as_array[0]] ) return [result] @@ -695,11 +696,11 @@ def replace_list( if not len(pairs): # shortcut, nothing to replace if using_copy_on_write and original_blocks: - result = self.copy(deep=False) - result._ref = result._ref = weakref.ref( # type: ignore[attr-defined] + nb = self.copy(deep=False) + nb._ref = nb._ref = weakref.ref( original_blocks[self.mgr_locs.as_array[0]] ) - return [result] + return [nb] else: return [self] if inplace else [self.copy()] From 2f8d719ac9c48d78cf75b0daca64f76f0785f927 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 22 Jan 2023 09:55:58 -0500 Subject: [PATCH 4/8] Fix test --- pandas/tests/copy_view/test_methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index fa15d3aca8ee1..b6226a40d3016 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1125,7 +1125,7 @@ def test_replace_mask_all_false_second_block(using_copy_on_write): # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) -def test_replace_coerce_single_column(using_copy_on_write): +def test_replace_coerce_single_column(using_copy_on_write, using_array_manager): df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) df_orig = df.copy() @@ -1135,7 +1135,7 @@ def test_replace_coerce_single_column(using_copy_on_write): assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - else: + elif not using_array_manager: assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) From af83e7a090f01c39439013382c97d5d82a9211dd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 21:19:34 +0100 Subject: [PATCH 5/8] Update for new ref tracking --- pandas/core/internals/blocks.py | 67 +++--------- pandas/core/internals/managers.py | 31 +----- pandas/tests/copy_view/test_methods.py | 103 ----------------- pandas/tests/copy_view/test_replace.py | 146 +++++++++++++++++++++++++ 4 files changed, 164 insertions(+), 183 deletions(-) create mode 100644 pandas/tests/copy_view/test_replace.py diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2f6b303d1ebda..3503173a833f4 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -11,7 +11,6 @@ cast, final, ) -import weakref import numpy as np @@ -160,7 +159,6 @@ class Block(PandasObject): is_extension = False _can_consolidate = True _validate_ndim = True - _ref = None @final @cache_readonly @@ -547,8 +545,7 @@ def replace( inplace: bool = False, # mask may be pre-computed if we're called from replace_list mask: npt.NDArray[np.bool_] | None = None, - original_blocks: list[Block] = [], - using_copy_on_write: bool = False, + using_cow: bool = False, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -573,12 +570,8 @@ def replace( # replacing it is a no-op. # Note: If to_replace were a list, NDFrame.replace would call # replace_list instead of replace. - if using_copy_on_write and original_blocks: - result = self.copy(deep=False) - result._ref = result._ref = weakref.ref( - original_blocks[self.mgr_locs.as_array[0]] - ) - return [result] + if using_cow: + return [self.copy(deep=False)] else: return [self] if inplace else [self.copy()] @@ -587,31 +580,26 @@ def replace( if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. - if using_copy_on_write and original_blocks: - result = self.copy(deep=False) - result._ref = result._ref = weakref.ref( - original_blocks[self.mgr_locs.as_array[0]] - ) - return [result] + if using_cow: + return [self.copy(deep=False)] else: return [self] if inplace else [self.copy()] elif self._can_hold_element(value): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? - if using_copy_on_write: - if original_blocks: - blk = self.copy() + if using_cow: + if inplace: + blk = self.copy(deep=self.refs.has_reference()) else: - # In case we made a copy before, e.g. coerce to target dtype - blk = self + blk = self.copy() else: blk = self if inplace else self.copy() putmask_inplace(blk.values, mask, value) if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise # may downcast to NaN - blocks = blk.convert(copy=False) + blocks = blk.convert(copy=False, using_cow=using_cow) else: blocks = [blk] return blocks @@ -631,13 +619,6 @@ def replace( else: # split so that we only upcast where necessary blocks = [] - if original_blocks: - _original_blocks = [original_blocks[self.mgr_locs.as_array[0]]] * len( - self - ) - else: - _original_blocks = [] - for i, nb in enumerate(self._split()): blocks.extend( type(self).replace( @@ -646,8 +627,7 @@ def replace( value=value, inplace=True, mask=mask[i : i + 1], - original_blocks=[self] * (self.mgr_locs.as_array.max() + 1), - using_copy_on_write=using_copy_on_write, + using_cow=using_cow, ) ) return blocks @@ -701,8 +681,6 @@ def replace_list( dest_list: Sequence[Any], inplace: bool = False, regex: bool = False, - original_blocks: list[Block] = [], - using_copy_on_write: bool = False, ) -> list[Block]: """ See BlockManager.replace_list docstring. @@ -723,14 +701,7 @@ def replace_list( ] if not len(pairs): # shortcut, nothing to replace - if using_copy_on_write and original_blocks: - nb = self.copy(deep=False) - nb._ref = nb._ref = weakref.ref( - original_blocks[self.mgr_locs.as_array[0]] - ) - return [nb] - else: - return [self] if inplace else [self.copy()] + return [self] if inplace else [self.copy()] src_len = len(pairs) - 1 @@ -748,11 +719,7 @@ def replace_list( masks = [extract_bool_array(x) for x in masks] - if using_copy_on_write: - # TODO(CoW): Optimize to avoid as many copies as possible - rb = [self.copy()] - else: - rb = [self if inplace else self.copy()] + rb = [self if inplace else self.copy()] for i, (src, dest) in enumerate(pairs): convert = i == src_len # only convert once at the end new_rb: list[Block] = [] @@ -775,10 +742,8 @@ def replace_list( to_replace=src, value=dest, mask=m, # type: ignore[arg-type] - inplace=True, # We already made a copy if inplace=False + inplace=True, regex=regex, - # TODO(CoW): Optimize to avoid as many copies as possible - using_copy_on_write=False, ) if convert and blk.is_object and not all(x is None for x in dest_list): # GH#44498 avoid unwanted cast-back @@ -795,8 +760,6 @@ def _replace_coerce( mask: npt.NDArray[np.bool_], inplace: bool = True, regex: bool = False, - original_blocks: list[Block] = [], - using_copy_on_write: bool = False, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -842,8 +805,6 @@ def _replace_coerce( value=value, inplace=inplace, mask=mask, - original_blocks=original_blocks, - using_copy_on_write=using_copy_on_write, ) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 951c916a96fc2..ded99f544ccf0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -450,36 +450,14 @@ def replace(self: T, to_replace, value, inplace: bool) -> T: # NDFrame.replace ensures the not-is_list_likes here assert not is_list_like(to_replace) assert not is_list_like(value) - return self._call_function_and_update_refs( + return self.apply( "replace", to_replace=to_replace, value=value, inplace=inplace, + using_cow=using_copy_on_write(), ) - def _call_function_and_update_refs(self, func, **kwargs): - if using_copy_on_write(): - use_cow = True - if self.is_single_block: - original_blocks = [self.blocks[0]] * self.shape[0] - else: - original_blocks = [self.blocks[i] for i in self.blknos] - else: - use_cow = False - original_blocks = [] - - mgr = self.apply( - func, - **kwargs, - original_blocks=original_blocks, - using_copy_on_write=use_cow, - ) - refs = [getattr(blk, "_ref", None) for blk in mgr.blocks] - if any(ref is not None for ref in refs): - mgr.refs = refs - mgr.parent = self - return mgr - def replace_regex(self, **kwargs): return self.apply("_replace_regex", **kwargs) @@ -493,15 +471,14 @@ def replace_list( """do a list replace""" inplace = validate_bool_kwarg(inplace, "inplace") - bm = self._call_function_and_update_refs( + bm = self.apply( "replace_list", src_list=src_list, dest_list=dest_list, inplace=inplace, regex=regex, ) - if not using_copy_on_write(): - bm._consolidate_inplace() + bm._consolidate_inplace() return bm def to_native_types(self: T, **kwargs) -> T: diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index b1345c4a9db82..27e49aa5560d0 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1199,109 +1199,6 @@ def test_items(using_copy_on_write): assert df.loc[0, name] == 0 -@pytest.mark.parametrize( - "replace_kwargs", - [ - {"to_replace": {"a": 1, "b": 4}, "value": -1}, - # Test CoW splits blocks to avoid copying unchanged columns - {"to_replace": {"a": 1}, "value": -1}, - {"to_replace": {"b": 4}, "value": -1}, - {"to_replace": {"b": {4: 1}}}, - # TODO: Add these in a further optimization - # We would need to see which columns got replaced in the mask - # which could be expensive - # {"to_replace": {"b": 1}}, - # 1 - ], -) -def test_replace(using_copy_on_write, replace_kwargs): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) - df_orig = df.copy() - - df_replaced = df.replace(**replace_kwargs) - - if using_copy_on_write: - if (df_replaced["b"] == df["b"]).all(): - assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) - - # mutating squeezed df triggers a copy-on-write for that column/block - df_replaced.loc[0, "c"] = -1 - if using_copy_on_write: - assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) - - if "a" in replace_kwargs["to_replace"]: - arr = get_array(df_replaced, "a") - df_replaced.loc[0, "a"] = 100 - assert np.shares_memory(get_array(df_replaced, "a"), arr) - tm.assert_frame_equal(df, df_orig) - - -def test_replace_mask_all_false_second_block(using_copy_on_write): - df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2}) - df_orig = df.copy() - - df2 = df.replace(to_replace=1.5, value=55.5) - - if using_copy_on_write: - assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - else: - assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - df2.loc[0, "c"] = 0.5 - tm.assert_frame_equal(df, df_orig) # Original is unchanged - - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) - # TODO: This should split and not copy the whole block - # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) - - -def test_replace_coerce_single_column(using_copy_on_write, using_array_manager): - df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) - df_orig = df.copy() - - df2 = df.replace(to_replace=1.5, value="a") - - if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - elif not using_array_manager: - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - if using_copy_on_write: - df2.loc[0, "b"] = 0.5 - tm.assert_frame_equal(df, df_orig) # Original is unchanged - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - - -@pytest.mark.parametrize("to_replace", ["xxx", ["xxx"]]) -def test_replace_to_replace_wrong_dtype(using_copy_on_write, to_replace): - df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) - df_orig = df.copy() - - df2 = df.replace(to_replace=to_replace, value=1.5) - - if using_copy_on_write: - assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - else: - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - - df2.loc[0, "b"] = 0.5 - tm.assert_frame_equal(df, df_orig) # Original is unchanged - - if using_copy_on_write: - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) - - def test_putmask(using_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1, "c": 2}) view = df[:] diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py new file mode 100644 index 0000000000000..7a4dc83786558 --- /dev/null +++ b/pandas/tests/copy_view/test_replace.py @@ -0,0 +1,146 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +@pytest.mark.parametrize( + "replace_kwargs", + [ + {"to_replace": {"a": 1, "b": 4}, "value": -1}, + # Test CoW splits blocks to avoid copying unchanged columns + {"to_replace": {"a": 1}, "value": -1}, + {"to_replace": {"b": 4}, "value": -1}, + {"to_replace": {"b": {4: 1}}}, + # TODO: Add these in a further optimization + # We would need to see which columns got replaced in the mask + # which could be expensive + # {"to_replace": {"b": 1}}, + # 1 + ], +) +def test_replace(using_copy_on_write, replace_kwargs): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df_orig = df.copy() + + df_replaced = df.replace(**replace_kwargs) + + if using_copy_on_write: + if (df_replaced["b"] == df["b"]).all(): + assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + # mutating squeezed df triggers a copy-on-write for that column/block + df_replaced.loc[0, "c"] = -1 + if using_copy_on_write: + assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + if "a" in replace_kwargs["to_replace"]: + arr = get_array(df_replaced, "a") + df_replaced.loc[0, "a"] = 100 + assert np.shares_memory(get_array(df_replaced, "a"), arr) + tm.assert_frame_equal(df, df_orig) + + +def test_replace_mask_all_false_second_block(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2}) + df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value=55.5) + + if using_copy_on_write: + # TODO: Block splitting would allow us to avoid copying b + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "c"] = 1 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + # TODO: This should split and not copy the whole block + # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) + + +def test_replace_coerce_single_column(using_copy_on_write, using_array_manager): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) + df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value="a") + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + elif not using_array_manager: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + if using_copy_on_write: + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + +def test_replace_to_replace_wrong_dtype(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) + df_orig = df.copy() + + df2 = df.replace(to_replace="xxx", value=1.5) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + +def test_replace_inplace(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + arr_a = get_array(df, "a") + df.replace(to_replace=1.5, value=15.5, inplace=True) + + assert np.shares_memory(get_array(df, "a"), arr_a) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +def test_replace_inplace_reference(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + arr_a = get_array(df, "a") + view = df[:] # noqa + df.replace(to_replace=1.5, value=15.5, inplace=True) + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) + + +@pytest.mark.parametrize("to_replace", ["a", 100.5]) +def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace): + df = DataFrame({"a": [1.5, 2, 3]}) + arr_a = get_array(df, "a") + view = df[:] # noqa + df.replace(to_replace=to_replace, value=15.5, inplace=True) + + assert np.shares_memory(get_array(df, "a"), arr_a) + if using_copy_on_write: + assert not df._mgr._has_no_reference(0) + assert not view._mgr._has_no_reference(0) From b52ec8996860bde700d8b5f691a51fd4268c075a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 21:24:30 +0100 Subject: [PATCH 6/8] Fix inplace argument --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3503173a833f4..40a4846de9568 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -742,7 +742,7 @@ def replace_list( to_replace=src, value=dest, mask=m, # type: ignore[arg-type] - inplace=True, + inplace=inplace, regex=regex, ) if convert and blk.is_object and not all(x is None for x in dest_list): From ce2944258486583ea7bf840febb190405fe21d27 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 21:51:43 +0100 Subject: [PATCH 7/8] Add cat tests --- pandas/core/internals/blocks.py | 7 +++- pandas/tests/copy_view/test_replace.py | 53 +++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 40a4846de9568..06efc9673f660 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -560,7 +560,12 @@ def replace( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - blk = self if inplace else self.copy() + if using_cow and (self.refs.has_reference() or not inplace): + blk = self.copy() + elif using_cow: + blk = self.copy(deep=False) + else: + blk = self if inplace else self.copy() values = cast(Categorical, blk.values) values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 7a4dc83786558..0572550706007 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,7 +1,10 @@ import numpy as np import pytest -from pandas import DataFrame +from pandas import ( + Categorical, + DataFrame, +) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -144,3 +147,51 @@ def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace): if using_copy_on_write: assert not df._mgr._has_no_reference(0) assert not view._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical_inplace_reference(using_copy_on_write, val): + df = DataFrame({"a": Categorical([1, 2, 3])}) + df_orig = df.copy() + arr_a = get_array(df, "a") + view = df[:] # noqa + df.replace(to_replace=1, value=val, inplace=True) + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + + +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical_inplace(using_copy_on_write, val): + df = DataFrame({"a": Categorical([1, 2, 3])}) + arr_a = get_array(df, "a") + df.replace(to_replace=1, value=val, inplace=True) + + assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + expected = DataFrame({"a": Categorical([val, 2, 3])}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical(using_copy_on_write, val): + df = DataFrame({"a": Categorical([1, 2, 3])}) + df_orig = df.copy() + df2 = df.replace(to_replace=1, value=val) + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert df2._mgr._has_no_reference(0) + assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes) + tm.assert_frame_equal(df, df_orig) + + arr_a = get_array(df2, "a").codes + df2.iloc[0, 0] = 2.0 + assert np.shares_memory(get_array(df2, "a").codes, arr_a) From 5467d2ac94165ed03e82773ba66de581a123612a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 16:59:15 +0100 Subject: [PATCH 8/8] Merge main --- pandas/tests/copy_view/test_replace.py | 181 +++++++++++++++++++++++-- 1 file changed, 171 insertions(+), 10 deletions(-) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index a1347d8e12950..46335747268fe 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( Categorical, @@ -8,31 +9,191 @@ from pandas.tests.copy_view.util import get_array -def test_replace_categorical_inplace_reference(using_copy_on_write): - df = DataFrame({"a": Categorical([1, 2, 3])}) +@pytest.mark.parametrize( + "replace_kwargs", + [ + {"to_replace": {"a": 1, "b": 4}, "value": -1}, + # Test CoW splits blocks to avoid copying unchanged columns + {"to_replace": {"a": 1}, "value": -1}, + {"to_replace": {"b": 4}, "value": -1}, + {"to_replace": {"b": {4: 1}}}, + # TODO: Add these in a further optimization + # We would need to see which columns got replaced in the mask + # which could be expensive + # {"to_replace": {"b": 1}}, + # 1 + ], +) +def test_replace(using_copy_on_write, replace_kwargs): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df_orig = df.copy() + + df_replaced = df.replace(**replace_kwargs) + + if using_copy_on_write: + if (df_replaced["b"] == df["b"]).all(): + assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + # mutating squeezed df triggers a copy-on-write for that column/block + df_replaced.loc[0, "c"] = -1 + if using_copy_on_write: + assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + if "a" in replace_kwargs["to_replace"]: + arr = get_array(df_replaced, "a") + df_replaced.loc[0, "a"] = 100 + assert np.shares_memory(get_array(df_replaced, "a"), arr) + tm.assert_frame_equal(df, df_orig) + + +def test_replace_mask_all_false_second_block(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2}) + df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value=55.5) + + if using_copy_on_write: + # TODO: Block splitting would allow us to avoid copying b + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "c"] = 1 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + # TODO: This should split and not copy the whole block + # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) + + +def test_replace_coerce_single_column(using_copy_on_write, using_array_manager): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value="a") + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + elif not using_array_manager: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + if using_copy_on_write: + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + +def test_replace_to_replace_wrong_dtype(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) + df_orig = df.copy() + + df2 = df.replace(to_replace="xxx", value=1.5) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + +def test_replace_inplace(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + arr_a = get_array(df, "a") + df.replace(to_replace=1.5, value=15.5, inplace=True) + + assert np.shares_memory(get_array(df, "a"), arr_a) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("to_replace", [1.5, [1.5]]) +def test_replace_inplace_reference(using_copy_on_write, to_replace): + df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") view = df[:] # noqa - df.replace(to_replace=[1], value=2, inplace=True) + df.replace(to_replace=to_replace, value=15.5, inplace=True) if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert not np.shares_memory(get_array(df, "a"), arr_a) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) - tm.assert_frame_equal(view, df_orig) else: - assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert np.shares_memory(get_array(df, "a"), arr_a) -def test_replace_inplace_reference(using_copy_on_write): +@pytest.mark.parametrize("to_replace", ["a", 100.5]) +def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") view = df[:] # noqa - df.replace(to_replace=[1.5], value=15.5, inplace=True) + df.replace(to_replace=to_replace, value=15.5, inplace=True) + assert np.shares_memory(get_array(df, "a"), arr_a) if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), arr_a) + assert not df._mgr._has_no_reference(0) + assert not view._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("to_replace", [1, [1]]) +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace): + df = DataFrame({"a": Categorical([1, 2, 3])}) + df_orig = df.copy() + arr_a = get_array(df, "a") + view = df[:] # noqa + df.replace(to_replace=to_replace, value=val, inplace=True) + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) else: - assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + + +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical_inplace(using_copy_on_write, val): + df = DataFrame({"a": Categorical([1, 2, 3])}) + arr_a = get_array(df, "a") + df.replace(to_replace=1, value=val, inplace=True) + + assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + expected = DataFrame({"a": Categorical([val, 2, 3])}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical(using_copy_on_write, val): + df = DataFrame({"a": Categorical([1, 2, 3])}) + df_orig = df.copy() + df2 = df.replace(to_replace=1, value=val) + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert df2._mgr._has_no_reference(0) + assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes) + tm.assert_frame_equal(df, df_orig) + + arr_a = get_array(df2, "a").codes + df2.iloc[0, 0] = 2.0 + assert np.shares_memory(get_array(df2, "a").codes, arr_a)