From f5f4747631504f97dfb7911654ae7ffd44f90f4e Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> Date: Wed, 24 Mar 2021 21:51:54 -0400 Subject: [PATCH] Backport PR #40604: REGR: replace with multivalued regex raising --- doc/source/whatsnew/v1.2.4.rst | 1 + pandas/core/internals/blocks.py | 16 +++++++++++++--- pandas/tests/frame/methods/test_replace.py | 22 ++++++++++++++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.4.rst b/doc/source/whatsnew/v1.2.4.rst index 45d131327630e..26d768f830830 100644 --- a/doc/source/whatsnew/v1.2.4.rst +++ b/doc/source/whatsnew/v1.2.4.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sum` when ``min_count`` greater than the :class:`DataFrame` shape was passed resulted in a ``ValueError`` (:issue:`39738`) - Fixed regression in :meth:`DataFrame.to_json` raising ``AttributeError`` when run on PyPy (:issue:`39837`) - Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`) +- Fixed regression in :meth:`DataFrame.replace` raising ``IndexError`` when ``regex`` was a multi-key dictionary (:issue:`39338`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5bc820e76bff1..b6bca855a9f05 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -894,10 +894,20 @@ def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: rb = [self if inplace else self.copy()] for i, (src, dest) in enumerate(pairs): + convert = i == src_len # only convert once at the end new_rb: List["Block"] = [] - for blk in rb: - m = masks[i] - convert = i == src_len # only convert once at the end + + # GH-39338: _replace_coerce can split a block into + # single-column blocks, so track the index so we know + # where to index into the mask + for blk_num, blk in enumerate(rb): + if len(rb) == 1: + m = masks[i] + else: + mib = masks[i] + assert not isinstance(mib, bool) + m = mib[blk_num : blk_num + 1] + result = blk._replace_coerce( to_replace=src, value=dest, diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 1b570028964df..c4f2e09911b34 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -644,6 +644,28 @@ def test_regex_replace_numeric_to_object_conversion(self, mix_abc): tm.assert_frame_equal(res, expec) assert res.a.dtype == np.object_ + @pytest.mark.parametrize( + "to_replace", [{"": np.nan, ",": ""}, {",": "", "": np.nan}] + ) + def test_joint_simple_replace_and_regex_replace(self, to_replace): + # GH-39338 + df = DataFrame( + { + "col1": ["1,000", "a", "3"], + "col2": ["a", "", "b"], + "col3": ["a", "b", "c"], + } + ) + result = df.replace(regex=to_replace) + expected = DataFrame( + { + "col1": ["1000", "a", "3"], + "col2": ["a", np.nan, "b"], + "col3": ["a", "b", "c"], + } + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) def test_replace_regex_metachar(self, metachar): df = DataFrame({"a": [metachar, "else"]})