From d1c6da1617ee7542dac809ef5bc8099ce7a1b41d Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sat, 16 Mar 2024 09:53:04 -0400 Subject: [PATCH 1/5] update test for GH#56599 --- pandas/tests/series/methods/test_replace.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index d4ec09332ad97..e914fafa92ea7 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -616,7 +616,8 @@ def test_replace_with_compiled_regex(self): def test_pandas_replace_na(self): # GH#43344 - ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA], dtype="string") + # GH#56599 + ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA, "AA"], dtype="string") regex_mapping = { "AA": "CC", "BB": "CC", @@ -624,7 +625,7 @@ def test_pandas_replace_na(self): "CC": "CC-REPL", } result = ser.replace(regex_mapping, regex=True) - exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") + exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA, "CC"], dtype="string") tm.assert_series_equal(result, exp) @pytest.mark.parametrize( From 0f6dd6f762938fe59fccaec30b14cef412ea7e94 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sat, 16 Mar 2024 09:55:21 -0400 Subject: [PATCH 2/5] bug: ser/df.replace only replaces first occurence with NAs --- pandas/core/array_algos/replace.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 6cc867c60fd82..f946c5adcbb0b 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -93,17 +93,18 @@ def _check_comparison_types( ) # GH#32621 use mask to avoid comparing to NAs - if isinstance(a, np.ndarray): + if isinstance(a, np.ndarray) and mask is not None: a = a[mask] - - result = op(a) - - if isinstance(result, np.ndarray) and mask is not None: - # The shape of the mask can differ to that of the result - # since we may compare only a subset of a's or b's elements - tmp = np.zeros(mask.shape, dtype=np.bool_) - np.place(tmp, mask, result) - result = tmp + result = op(a) + + if isinstance(result, np.ndarray): + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool_) + np.place(tmp, mask, result) + result = tmp + else: + result = op(a) _check_comparison_types(result, a, b) return result From e5996a4760f07dba747fa250f89c8207cbee2cfe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 16 Mar 2024 14:06:02 +0000 Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/tests/series/methods/test_replace.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index e914fafa92ea7..c7b894e73d0dd 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -625,7 +625,9 @@ def test_pandas_replace_na(self): "CC": "CC-REPL", } result = ser.replace(regex_mapping, regex=True) - exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA, "CC"], dtype="string") + exp = pd.Series( + ["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA, "CC"], dtype="string" + ) tm.assert_series_equal(result, exp) @pytest.mark.parametrize( From 624a541e0fb7e21aac0048bebf617c5d20194836 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sat, 16 Mar 2024 19:55:02 -0400 Subject: [PATCH 4/5] add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7263329d2e53b..b32090c9bfe24 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -296,6 +296,7 @@ Bug fixes - Fixed bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) +- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when `regex=True` and missing values are present. (:issue:`56599`) Categorical ^^^^^^^^^^^ From 30961070ef374741e0745fcefdca721c5f64d248 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sat, 16 Mar 2024 20:20:25 -0400 Subject: [PATCH 5/5] fmt fix --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b32090c9bfe24..dfbebae8d42b6 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -296,7 +296,7 @@ Bug fixes - Fixed bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) -- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when `regex=True` and missing values are present. (:issue:`56599`) +- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) Categorical ^^^^^^^^^^^