From d8485666678a009c8263954d3fa9d971c34683a6 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Mon, 15 Apr 2024 23:54:28 +0200 Subject: [PATCH 1/8] enforce depr behavior df.replace / s.replace with CategoricalDtype --- pandas/core/arrays/categorical.py | 10 ++----- pandas/tests/series/methods/test_replace.py | 29 ++++----------------- 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8d6880fc2acb3..4bc2782c84581 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2680,15 +2680,9 @@ def _replace(self, *, to_replace, value, inplace: bool = False) -> Self | None: NDArrayBacked.__init__(cat, new_codes, new_dtype) if new_dtype != orig_dtype: - warnings.warn( - # GH#55147 + raise TypeError( "The behavior of Series.replace (and DataFrame.replace) with " - "CategoricalDtype is deprecated. In a future version, replace " - "will only be used for cases that preserve the categories. " - "To change the categories, use ser.cat.rename_categories " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), + "CategoricalDtype is not supported." ) if not inplace: return cat diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 0a79bcea679a7..5a09637182ed3 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -370,9 +370,7 @@ def test_replace_mixed_types_with_string(self): def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(pd.Categorical(categorical, categories=["A", "B"])) - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace({"A": 1, "B": 2}) + result = ser.cat.rename_categories({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. categories should be [1, 2] even if there are no "B"s present @@ -383,14 +381,12 @@ def test_replace_categorical(self, categorical, numeric): @pytest.mark.parametrize( "data, data_exp", [(["a", "b", "c"], ["b", "b", "c"]), (["a"], ["b"])] ) - def test_replace_categorical_inplace(self, data, data_exp): + def test_replace_categorical_inplace_raises(self, data, data_exp): # GH 53358 result = pd.Series(data, dtype="category") - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): result.replace(to_replace="a", value="b", inplace=True) - expected = pd.Series(data_exp, dtype="category") - tm.assert_series_equal(result, expected) def test_replace_categorical_single(self): # GH 26988 @@ -404,25 +400,10 @@ def test_replace_categorical_single(self): expected = expected.cat.remove_unused_categories() assert c[2] != "foo" - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = c.replace(c[2], "foo") + result = c.cat.rename_categories({c.values[2]: "foo"}) tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[2], "foo", inplace=True) - assert return_value is None - tm.assert_series_equal(expected, c) - - first_value = c[0] - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[1], c[0], inplace=True) - assert return_value is None - assert c[0] == c[1] == first_value # test replacing with existing value - def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError From 508b0ffb94464050d70d8e4d390a4605ab21985a Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 16 Apr 2024 12:09:53 +0200 Subject: [PATCH 2/8] fixup tests in frame/methods/test_replace.py --- pandas/tests/frame/methods/test_replace.py | 57 +++++----------------- 1 file changed, 13 insertions(+), 44 deletions(-) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index fb7ba2b7af38a..4f4ff142e7121 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1171,38 +1171,6 @@ def test_replace_with_empty_dictlike(self, mix_abc): tm.assert_frame_equal(df, df.replace({"b": {}})) tm.assert_frame_equal(df, df.replace(Series({"b": {}}))) - @pytest.mark.parametrize( - "replace_dict, final_data", - [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], - ) - def test_categorical_replace_with_dict(self, replace_dict, final_data): - # GH 26988 - df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") - - final_data = np.array(final_data) - - a = pd.Categorical(final_data[:, 0], categories=[3, 2]) - - ex_cat = [3, 2] if replace_dict["b"] == 1 else [1, 3] - b = pd.Categorical(final_data[:, 1], categories=ex_cat) - - expected = DataFrame({"a": a, "b": b}) - msg2 = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = df.replace(replace_dict, 3) - tm.assert_frame_equal(result, expected) - msg = ( - r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " - "different" - ) - with pytest.raises(AssertionError, match=msg): - # ensure non-inplace call does not affect original - tm.assert_frame_equal(df, expected) - with tm.assert_produces_warning(FutureWarning, match=msg2): - return_value = df.replace(replace_dict, 3, inplace=True) - assert return_value is None - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "df, to_replace, exp", [ @@ -1345,15 +1313,17 @@ def test_replace_value_category_type(self): ) # replace values in input dataframe - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"d": "z"}) + ) + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"obj1": "obj9"}) + ) + result = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"}) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = input_df.replace("cat2", "catX") + result = result.astype({"col1": "int64", "col3": "float64", "col5": "object"}) tm.assert_frame_equal(result, expected) def test_replace_dict_category_type(self): @@ -1378,12 +1348,11 @@ def test_replace_dict_category_type(self): ) # replace values in input dataframe using a dict - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + result = input_df.apply( + lambda x: x.cat.rename_categories( + {"a": "z", "obj1": "obj9", "cat1": "catX"} + ) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) From 087dcc7eb77b3359c589bef12581996e17f8a1f0 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 16 Apr 2024 14:47:43 +0200 Subject: [PATCH 3/8] fixup tests in arrays/categorical/test_replace.py and pandas/tests/copy_view/test_replace.py --- .../tests/arrays/categorical/test_replace.py | 72 +++++----------- pandas/tests/copy_view/test_replace.py | 83 +++++++++---------- 2 files changed, 60 insertions(+), 95 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 3c677142846d7..1034cade6e073 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -9,31 +9,12 @@ "to_replace,value,expected,flip_categories", [ # one-to-one - (1, 2, [2, 2, 3], False), - (1, 4, [4, 2, 3], False), (4, 1, [1, 2, 3], False), (5, 6, [1, 2, 3], False), # many-to-one - ([1], 2, [2, 2, 3], False), - ([1, 2], 3, [3, 3, 3], False), - ([1, 2], 4, [4, 4, 3], False), - ((1, 2, 4), 5, [5, 5, 3], False), ((5, 6), 2, [1, 2, 3], False), - ([1], [2], [2, 2, 3], False), - ([1, 4], [5, 2], [5, 2, 3], False), - # GH49404: overlap between to_replace and value - ([1, 2, 3], [2, 3, 4], [2, 3, 4], False), - # GH50872, GH46884: replace with null - (1, None, [None, 2, 3], False), - (1, pd.NA, [None, 2, 3], False), - # check_categorical sorts categories, which crashes on mixed dtypes - (3, "4", [1, 2, "4"], False), - ([1, 2, "3"], "5", ["5", "5", 3], True), ], ) -@pytest.mark.filterwarnings( - "ignore:.*with CategoricalDtype is deprecated:FutureWarning" -) def test_replace_categorical_series(to_replace, value, expected, flip_categories): # GH 31720 @@ -63,49 +44,36 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): # GH#26988 cat = Categorical(["a", "b"]) expected = Categorical(result) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if expected_error_msg is not None else None - with tm.assert_produces_warning(warn, match=msg): + + if expected_error_msg is None: result = pd.Series(cat, copy=False).replace(to_replace, value)._values + tm.assert_categorical_equal(result, expected) + elif value is not None: + result = ( + pd.Series(cat, copy=False) + .cat.rename_categories({to_replace: value}) + ._values + ) + tm.assert_categorical_equal(result, expected) - tm.assert_categorical_equal(result, expected) if to_replace == "b": # the "c" test is supposed to be unchanged with pytest.raises(AssertionError, match=expected_error_msg): # ensure non-inplace call does not affect original tm.assert_categorical_equal(cat, expected) ser = pd.Series(cat, copy=False) - with tm.assert_produces_warning(warn, match=msg): + if expected_error_msg is None: ser.replace(to_replace, value, inplace=True) - tm.assert_categorical_equal(cat, expected) + tm.assert_categorical_equal(cat, expected) + else: + msg2 = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg2): + ser.replace(to_replace, value, inplace=True) -def test_replace_categorical_ea_dtype(): +def test_replace_categorical_ea_dtype_raises(): # GH49404 cat = Categorical(pd.array(["a", "b"], dtype="string")) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values - expected = Categorical(pd.array(["c", pd.NA], dtype="string")) - tm.assert_categorical_equal(result, expected) - - -def test_replace_maintain_ordering(): - # GH51016 - dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) - ser = pd.Series([0, 1, 2], dtype=dtype) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace(0, 2) - expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) - expected = pd.Series([2, 1, 2], dtype=expected_dtype) - tm.assert_series_equal(expected, result, check_category_order=True) + msg2 = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg2): + pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 63254f1244a2e..c6db850c5317a 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -129,18 +129,18 @@ def test_replace_to_replace_wrong_dtype(): def test_replace_list_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") arr = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): + + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): df.replace(["c"], value="a", inplace=True) + df.apply(lambda x: x.cat.rename_categories({"c": "a"})) assert np.shares_memory(arr.codes, get_array(df, "a").codes) assert df._mgr._has_no_reference(0) df_orig = df.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): - df2 = df.replace(["b"], value="a") + with pytest.raises(TypeError, match=msg): + df.replace(["b"], value="a") + df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"})) assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) tm.assert_frame_equal(df, df_orig) @@ -150,13 +150,12 @@ def test_replace_list_inplace_refs_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") view = df[:] df_orig = df.copy() - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.replace(["c"], value="a", inplace=True) - assert not np.shares_memory(get_array(view, "a").codes, get_array(df, "a").codes) + + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): + df.replace(["c"], value="d", inplace=True) + df.apply(lambda x: x.cat.rename_categories({"c": "d"})) + tm.assert_frame_equal(df_orig, view) @@ -201,30 +200,29 @@ def test_replace_categorical_inplace_reference(val, to_replace): df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): + if val == 1.5: + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): + df.replace(to_replace=to_replace, value=val, inplace=True) + else: df.replace(to_replace=to_replace, value=val, inplace=True) - assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) - assert df._mgr._has_no_reference(0) - assert view._mgr._has_no_reference(0) - tm.assert_frame_equal(view, df_orig) + assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) @pytest.mark.parametrize("val", [1, 1.5]) def test_replace_categorical_inplace(val): df = DataFrame({"a": Categorical([1, 2, 3])}) arr_a = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): + + if val == 1.5: + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): + df.replace(to_replace=1, value=val, inplace=True) + else: df.replace(to_replace=1, value=val, inplace=True) assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) @@ -238,22 +236,21 @@ def test_replace_categorical_inplace(val): def test_replace_categorical(val): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): + if val == 1.5: + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): + df.replace(to_replace=1, value=val) + else: df2 = df.replace(to_replace=1, value=val) - assert df._mgr._has_no_reference(0) - assert df2._mgr._has_no_reference(0) - assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes) - tm.assert_frame_equal(df, df_orig) + assert df._mgr._has_no_reference(0) + assert df2._mgr._has_no_reference(0) + assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes) + tm.assert_frame_equal(df, df_orig) - arr_a = get_array(df2, "a").codes - df2.iloc[0, 0] = 2.0 - assert np.shares_memory(get_array(df2, "a").codes, arr_a) + arr_a = get_array(df2, "a").codes + df2.iloc[0, 0] = 2.0 + assert np.shares_memory(get_array(df2, "a").codes, arr_a) @pytest.mark.parametrize("method", ["where", "mask"]) From ad6f12f4b7868149e7b1e97e0cc9f7a0644b8347 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 16 Apr 2024 14:51:45 +0200 Subject: [PATCH 4/8] add a note to v3.0.0 --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f709bec842c86..cd4350a7413b6 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -246,6 +246,7 @@ Removal of prior version deprecations/changes - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`) - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`) - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`) +- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` (:issue:`58270`) - Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`) - Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Enforced silent-downcasting deprecation for :ref:`all relevant methods ` (:issue:`54710`) From 85a0fd190438268946e4fd35e597d2670cf26c70 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Thu, 18 Apr 2024 10:35:24 +0200 Subject: [PATCH 5/8] remove _replace and special-casing, fix tests --- pandas/core/arrays/categorical.py | 50 ---------------------- pandas/core/internals/blocks.py | 20 ++++----- pandas/tests/copy_view/test_replace.py | 58 ++++---------------------- 3 files changed, 17 insertions(+), 111 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4bc2782c84581..7fba7dcdfe46c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2638,56 +2638,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) - @overload - def _replace(self, *, to_replace, value, inplace: Literal[False] = ...) -> Self: ... - - @overload - def _replace(self, *, to_replace, value, inplace: Literal[True]) -> None: ... - - def _replace(self, *, to_replace, value, inplace: bool = False) -> Self | None: - from pandas import Index - - orig_dtype = self.dtype - - inplace = validate_bool_kwarg(inplace, "inplace") - cat = self if inplace else self.copy() - - mask = isna(np.asarray(value)) - if mask.any(): - removals = np.asarray(to_replace)[mask] - removals = cat.categories[cat.categories.isin(removals)] - new_cat = cat.remove_categories(removals) - NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype) - - ser = cat.categories.to_series() - ser = ser.replace(to_replace=to_replace, value=value) - - all_values = Index(ser) - - # GH51016: maintain order of existing categories - idxr = cat.categories.get_indexer_for(all_values) - locs = np.arange(len(ser)) - locs = np.where(idxr == -1, locs, idxr) - locs = locs.argsort() - - new_categories = ser.take(locs) - new_categories = new_categories.drop_duplicates(keep="first") - index_categories = Index(new_categories) - new_codes = recode_for_categories( - cat._codes, all_values, index_categories, copy=False - ) - new_dtype = CategoricalDtype(index_categories, ordered=self.dtype.ordered) - NDArrayBacked.__init__(cat, new_codes, new_dtype) - - if new_dtype != orig_dtype: - raise TypeError( - "The behavior of Series.replace (and DataFrame.replace) with " - "CategoricalDtype is not supported." - ) - if not inplace: - return cat - return None - # ------------------------------------------------------------------------ # String methods interface def _str_map( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7be1d5d95ffdf..85e9fa57a36b0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -683,12 +683,10 @@ def replace( values = self.values if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=to_replace, value=value, inplace=True) - return [blk] + raise TypeError( + "The behavior of Series.replace (and DataFrame.replace) with " + "CategoricalDtype is not supported." + ) if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that @@ -790,12 +788,10 @@ def replace_list( values = self.values if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=src_list, value=dest_list, inplace=True) - return [blk] + raise TypeError( + "The behavior of Series.replace (and DataFrame.replace) with " + "CategoricalDtype is not supported." + ) # Exclude anything that we know we won't contain pairs = [ diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index c6db850c5317a..56a1c725ae507 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -133,7 +133,7 @@ def test_replace_list_categorical(): msg = "with CategoricalDtype is not supported" with pytest.raises(TypeError, match=msg): df.replace(["c"], value="a", inplace=True) - df.apply(lambda x: x.cat.rename_categories({"c": "a"})) + df.apply(lambda x: x.cat.rename_categories({"c": "d"})) assert np.shares_memory(arr.codes, get_array(df, "a").codes) assert df._mgr._has_no_reference(0) @@ -197,61 +197,21 @@ def test_replace_inplace_reference_no_op(to_replace): @pytest.mark.parametrize("val", [1, 1.5]) def test_replace_categorical_inplace_reference(val, to_replace): df = DataFrame({"a": Categorical([1, 2, 3])}) - df_orig = df.copy() - arr_a = get_array(df, "a") - view = df[:] - if val == 1.5: - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - df.replace(to_replace=to_replace, value=val, inplace=True) - else: + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): df.replace(to_replace=to_replace, value=val, inplace=True) - assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) - assert df._mgr._has_no_reference(0) - assert view._mgr._has_no_reference(0) - tm.assert_frame_equal(view, df_orig) - -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace(val): +@pytest.mark.parametrize("val", [1]) +def test_replace_categorical_raises(val): df = DataFrame({"a": Categorical([1, 2, 3])}) - arr_a = get_array(df, "a") + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): + df.replace(to_replace=1, value=val) - if val == 1.5: - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - df.replace(to_replace=1, value=val, inplace=True) - else: + with pytest.raises(TypeError, match=msg): df.replace(to_replace=1, value=val, inplace=True) - assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) - assert df._mgr._has_no_reference(0) - - expected = DataFrame({"a": Categorical([val, 2, 3])}) - tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical(val): - df = DataFrame({"a": Categorical([1, 2, 3])}) - df_orig = df.copy() - if val == 1.5: - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - df.replace(to_replace=1, value=val) - else: - df2 = df.replace(to_replace=1, value=val) - - assert df._mgr._has_no_reference(0) - assert df2._mgr._has_no_reference(0) - assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes) - tm.assert_frame_equal(df, df_orig) - - arr_a = get_array(df2, "a").codes - df2.iloc[0, 0] = 2.0 - assert np.shares_memory(get_array(df2, "a").codes, arr_a) - @pytest.mark.parametrize("method", ["where", "mask"]) def test_masking_inplace(method): From c23e34a6ef768b80388a4b4b8bfb07bd92dc3b0a Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Thu, 2 May 2024 19:50:48 +0200 Subject: [PATCH 6/8] fix tests --- .../tests/arrays/categorical/test_replace.py | 15 ++----- pandas/tests/frame/methods/test_replace.py | 45 +++++-------------- 2 files changed, 15 insertions(+), 45 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 1034cade6e073..a08029fa6a53e 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -10,7 +10,6 @@ [ # one-to-one (4, 1, [1, 2, 3], False), - (5, 6, [1, 2, 3], False), # many-to-one ((5, 6), 2, [1, 2, 3], False), ], @@ -19,24 +18,16 @@ def test_replace_categorical_series(to_replace, value, expected, flip_categories # GH 31720 ser = pd.Series([1, 2, 3], dtype="category") - result = ser.replace(to_replace, value) - expected = pd.Series(expected, dtype="category") - ser.replace(to_replace, value, inplace=True) - - if flip_categories: - expected = expected.cat.set_categories(expected.cat.categories[::-1]) - - tm.assert_series_equal(expected, result, check_category_order=False) - tm.assert_series_equal(expected, ser, check_category_order=False) + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): + ser.replace(to_replace, value) @pytest.mark.parametrize( "to_replace, value, result, expected_error_msg", [ ("b", "c", ["a", "c"], "Categorical.categories are different"), - ("c", "d", ["a", "b"], None), # https://github.com/pandas-dev/pandas/issues/33288 - ("a", "a", ["a", "b"], None), ("b", None, ["a", None], "Categorical.categories length are different"), ], ) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 4f4ff142e7121..bcb184eeb4a60 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -705,25 +705,6 @@ def test_replace_NAT_with_None(self): expected = DataFrame([None, None]) tm.assert_frame_equal(result, expected) - def test_replace_with_None_keeps_categorical(self): - # gh-46634 - cat_series = Series(["b", "b", "b", "d"], dtype="category") - df = DataFrame( - { - "id": Series([5, 4, 3, 2], dtype="float64"), - "col": cat_series, - } - ) - result = df.replace({3: None}) - - expected = DataFrame( - { - "id": Series([5.0, 4.0, None, 2.0], dtype="object"), - "col": cat_series, - } - ) - tm.assert_frame_equal(result, expected) - def test_replace_value_is_none(self, datetime_frame): orig_value = datetime_frame.iloc[0, 0] orig2 = datetime_frame.iloc[1, 0] @@ -1424,6 +1405,18 @@ def test_replace_with_nil_na(self): result = ser.replace("nil", "anything else") tm.assert_frame_equal(expected, result) + def test_replace_with_categorical_raises(self): + input_dict = { + "col1": [1, 2, 3, 4], + "col2": ["a", "b", "c", "d"], + "col4": ["cat1", "cat2", "cat3", "cat4"], + } + df = DataFrame(data=input_dict).astype({"col2": "category", "col4": "category"}) + + msg = "with CategoricalDtype is not supported" + with pytest.raises(TypeError, match=msg): + df.replace({3: None}) + class TestDataFrameReplaceRegex: @pytest.mark.parametrize( @@ -1491,20 +1484,6 @@ def test_replace_with_value_also_being_replaced(self): expected = DataFrame({"A": [1, np.nan, 2], "B": [np.nan, 1, 2]}) tm.assert_frame_equal(result, expected) - def test_replace_categorical_no_replacement(self): - # GH#46672 - df = DataFrame( - { - "a": ["one", "two", None, "three"], - "b": ["one", None, "two", "three"], - }, - dtype="category", - ) - expected = df.copy() - - result = df.replace(to_replace=[".", "def"], value=["_", None]) - tm.assert_frame_equal(result, expected) - def test_replace_object_splitting(self, using_infer_string): # GH#53977 df = DataFrame({"a": ["a"], "b": "b"}) From d8512c188e9bc2e554b8dad339b125f807fb2d12 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Jun 2024 12:33:46 -0700 Subject: [PATCH 7/8] Adjust tests and clarify whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/internals/blocks.py | 13 --- .../tests/arrays/categorical/test_replace.py | 85 ++++++++++--------- pandas/tests/copy_view/test_replace.py | 59 +++++++------ pandas/tests/frame/methods/test_replace.py | 69 ++++++++++++--- pandas/tests/series/methods/test_replace.py | 13 ++- 6 files changed, 142 insertions(+), 99 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0e92bddcbeb36..bb8def5fb0d47 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -381,7 +381,7 @@ Other Removals - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`) - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`) - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`) -- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` (:issue:`58270`) +- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories. (:issue:`58270`) - Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`) - Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Enforced silent-downcasting deprecation for :ref:`all relevant methods ` (:issue:`54710`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 72e77d82da1f3..6bb335bca12b3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -100,7 +100,6 @@ ) from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( - Categorical, DatetimeArray, ExtensionArray, IntervalArray, @@ -696,12 +695,6 @@ def replace( # go through replace_list values = self.values - if isinstance(values, Categorical): - raise TypeError( - "The behavior of Series.replace (and DataFrame.replace) with " - "CategoricalDtype is not supported." - ) - if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. @@ -801,12 +794,6 @@ def replace_list( """ values = self.values - if isinstance(values, Categorical): - raise TypeError( - "The behavior of Series.replace (and DataFrame.replace) with " - "CategoricalDtype is not supported." - ) - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index a08029fa6a53e..7f3e8d3ed6e6e 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -6,65 +6,66 @@ @pytest.mark.parametrize( - "to_replace,value,expected,flip_categories", + "to_replace,value,expected", [ # one-to-one - (4, 1, [1, 2, 3], False), + (4, 1, [1, 2, 3]), + (3, 1, [1, 2, 1]), # many-to-one - ((5, 6), 2, [1, 2, 3], False), + ((5, 6), 2, [1, 2, 3]), + ((3, 2), 1, [1, 1, 1]), ], ) -def test_replace_categorical_series(to_replace, value, expected, flip_categories): +def test_replace_categorical_series(to_replace, value, expected): # GH 31720 - ser = pd.Series([1, 2, 3], dtype="category") - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - ser.replace(to_replace, value) + result = ser.replace(to_replace, value) + expected = pd.Series(Categorical(expected, categories=[1, 2, 3])) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "to_replace, value, result, expected_error_msg", + "to_replace,value", [ - ("b", "c", ["a", "c"], "Categorical.categories are different"), - # https://github.com/pandas-dev/pandas/issues/33288 - ("b", None, ["a", None], "Categorical.categories length are different"), + # one-to-one + (3, 5), + # many-to-one + ((3, 2), 5), ], ) -def test_replace_categorical(to_replace, value, result, expected_error_msg): - # GH#26988 - cat = Categorical(["a", "b"]) - expected = Categorical(result) +def test_replace_categorical_series_new_category_raises(to_replace, value): + # GH 31720 + ser = pd.Series([1, 2, 3], dtype="category") + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + ser.replace(to_replace, value) - if expected_error_msg is None: - result = pd.Series(cat, copy=False).replace(to_replace, value)._values - tm.assert_categorical_equal(result, expected) - elif value is not None: - result = ( - pd.Series(cat, copy=False) - .cat.rename_categories({to_replace: value}) - ._values - ) - tm.assert_categorical_equal(result, expected) - if to_replace == "b": # the "c" test is supposed to be unchanged - with pytest.raises(AssertionError, match=expected_error_msg): - # ensure non-inplace call does not affect original - tm.assert_categorical_equal(cat, expected) +def test_replace_maintain_ordering(): + # GH51016 + dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) + ser = pd.Series([0, 1, 2], dtype=dtype) + result = ser.replace(0, 2) + expected = pd.Series([2, 1, 2], dtype=dtype) + tm.assert_series_equal(expected, result, check_category_order=True) - ser = pd.Series(cat, copy=False) - if expected_error_msg is None: - ser.replace(to_replace, value, inplace=True) - tm.assert_categorical_equal(cat, expected) - else: - msg2 = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg2): - ser.replace(to_replace, value, inplace=True) + +def test_replace_categorical_ea_dtype(): + # GH49404 + cat = Categorical(pd.array(["a", "b", "c"], dtype="string")) + result = pd.Series(cat).replace(["a", "b"], ["c", "c"])._values + expected = Categorical( + pd.array(["c"] * 3, dtype="string"), + categories=pd.array(["a", "b", "c"], dtype="string"), + ) + tm.assert_categorical_equal(result, expected) -def test_replace_categorical_ea_dtype_raises(): +def test_replace_categorical_ea_dtype_different_cats_raises(): # GH49404 cat = Categorical(pd.array(["a", "b"], dtype="string")) - msg2 = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg2): - pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + pd.Series(cat).replace(["a", "b"], ["c", pd.NA]) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 56a1c725ae507..2eb88923c0087 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -130,16 +130,12 @@ def test_replace_list_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") arr = get_array(df, "a") - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - df.replace(["c"], value="a", inplace=True) - df.apply(lambda x: x.cat.rename_categories({"c": "d"})) + df.replace(["c"], value="a", inplace=True) assert np.shares_memory(arr.codes, get_array(df, "a").codes) assert df._mgr._has_no_reference(0) df_orig = df.copy() - with pytest.raises(TypeError, match=msg): - df.replace(["b"], value="a") + df.replace(["b"], value="a") df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"})) assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) @@ -150,12 +146,7 @@ def test_replace_list_inplace_refs_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") view = df[:] df_orig = df.copy() - - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - df.replace(["c"], value="d", inplace=True) - df.apply(lambda x: x.cat.rename_categories({"c": "d"})) - + df.replace(["c"], value="a", inplace=True) tm.assert_frame_equal(df_orig, view) @@ -194,23 +185,43 @@ def test_replace_inplace_reference_no_op(to_replace): @pytest.mark.parametrize("to_replace", [1, [1]]) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace_reference(val, to_replace): +def test_replace_categorical_inplace_reference(to_replace): df = DataFrame({"a": Categorical([1, 2, 3])}) - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - df.replace(to_replace=to_replace, value=val, inplace=True) + df_orig = df.copy() + arr_a = get_array(df, "a") + view = df[:] + df.replace(to_replace=to_replace, value=1, inplace=True) + assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) -@pytest.mark.parametrize("val", [1]) -def test_replace_categorical_raises(val): +def test_replace_categorical_inplace(): df = DataFrame({"a": Categorical([1, 2, 3])}) - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - df.replace(to_replace=1, value=val) + arr_a = get_array(df, "a") + df.replace(to_replace=1, value=1, inplace=True) + + assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert df._mgr._has_no_reference(0) + + expected = DataFrame({"a": Categorical([1, 2, 3])}) + tm.assert_frame_equal(df, expected) + + +def test_replace_categorical(): + df = DataFrame({"a": Categorical([1, 2, 3])}) + df_orig = df.copy() + df2 = df.replace(to_replace=1, value=1) + + assert df._mgr._has_no_reference(0) + assert df2._mgr._has_no_reference(0) + assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes) + tm.assert_frame_equal(df, df_orig) - with pytest.raises(TypeError, match=msg): - df.replace(to_replace=1, value=val, inplace=True) + arr_a = get_array(df2, "a").codes + df2.iloc[0, 0] = 2.0 + assert np.shares_memory(get_array(df2, "a").codes, arr_a) @pytest.mark.parametrize("method", ["where", "mask"]) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index bcb184eeb4a60..3fcc4aaa6960f 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -705,6 +705,25 @@ def test_replace_NAT_with_None(self): expected = DataFrame([None, None]) tm.assert_frame_equal(result, expected) + def test_replace_with_None_keeps_categorical(self): + # gh-46634 + cat_series = Series(["b", "b", "b", "d"], dtype="category") + df = DataFrame( + { + "id": Series([5, 4, 3, 2], dtype="float64"), + "col": cat_series, + } + ) + result = df.replace({3: None}) + + expected = DataFrame( + { + "id": Series([5.0, 4.0, None, 2.0], dtype="object"), + "col": cat_series, + } + ) + tm.assert_frame_equal(result, expected) + def test_replace_value_is_none(self, datetime_frame): orig_value = datetime_frame.iloc[0, 0] orig2 = datetime_frame.iloc[1, 0] @@ -1249,6 +1268,30 @@ def test_replace_ea_ignore_float(self, frame_or_series, value): result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[2, 2], [2, 2]]), ({"a": 1, "b": 2}, [[2, 1], [2, 2]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + + final_data = np.array(final_data) + + a = pd.Categorical(final_data[:, 0], categories=[1, 2]) + b = pd.Categorical(final_data[:, 1], categories=[1, 2]) + + expected = DataFrame({"a": a, "b": b}) + result = df.replace(replace_dict, 2) + tm.assert_frame_equal(result, expected) + msg = r"DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " "different" + with pytest.raises(AssertionError, match=msg): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + return_value = df.replace(replace_dict, 2, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1405,18 +1448,6 @@ def test_replace_with_nil_na(self): result = ser.replace("nil", "anything else") tm.assert_frame_equal(expected, result) - def test_replace_with_categorical_raises(self): - input_dict = { - "col1": [1, 2, 3, 4], - "col2": ["a", "b", "c", "d"], - "col4": ["cat1", "cat2", "cat3", "cat4"], - } - df = DataFrame(data=input_dict).astype({"col2": "category", "col4": "category"}) - - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - df.replace({3: None}) - class TestDataFrameReplaceRegex: @pytest.mark.parametrize( @@ -1484,6 +1515,20 @@ def test_replace_with_value_also_being_replaced(self): expected = DataFrame({"A": [1, np.nan, 2], "B": [np.nan, 1, 2]}) tm.assert_frame_equal(result, expected) + def test_replace_categorical_no_replacement(self): + # GH#46672 + df = DataFrame( + { + "a": ["one", "two", None, "three"], + "b": ["one", None, "two", "three"], + }, + dtype="category", + ) + expected = df.copy() + + result = df.replace(to_replace=[".", "def"], value=["_", None]) + tm.assert_frame_equal(result, expected) + def test_replace_object_splitting(self, using_infer_string): # GH#53977 df = DataFrame({"a": ["a"], "b": "b"}) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 5a09637182ed3..90654df155cf0 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -378,15 +378,14 @@ def test_replace_categorical(self, categorical, numeric): expected = expected.cat.add_categories(2) tm.assert_series_equal(expected, result, check_categorical=False) - @pytest.mark.parametrize( - "data, data_exp", [(["a", "b", "c"], ["b", "b", "c"]), (["a"], ["b"])] - ) - def test_replace_categorical_inplace_raises(self, data, data_exp): + def test_replace_categorical_inplace(self): # GH 53358 + data = ["a", "b", "c"] + data_exp = ["b", "b", "c"] result = pd.Series(data, dtype="category") - msg = "with CategoricalDtype is not supported" - with pytest.raises(TypeError, match=msg): - result.replace(to_replace="a", value="b", inplace=True) + result.replace(to_replace="a", value="b", inplace=True) + expected = pd.Series(pd.Categorical(data_exp, categories=data)) + tm.assert_series_equal(result, expected) def test_replace_categorical_single(self): # GH 26988 From d17f47a77929f9760fe4573902ed406757271ad7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Jun 2024 13:13:07 -0700 Subject: [PATCH 8/8] Fix pre-commit --- pandas/core/arrays/categorical.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 028b1396371d4..18b52f741370f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -10,7 +10,6 @@ cast, overload, ) -import warnings import numpy as np @@ -23,7 +22,6 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas.compat.numpy import function as nv -from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import (