From a33339d5e7ab440bb11a9c5c15bd3b9818478869 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 3 May 2021 22:38:44 +0100 Subject: [PATCH] [ArrowStringArray] pre-cursor to adding replace str accessor method --- pandas/core/strings/accessor.py | 24 +++ pandas/core/strings/object_array.py | 45 ++---- pandas/tests/strings/test_find_replace.py | 171 ++++++++++++---------- 3 files changed, 131 insertions(+), 109 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 85a58d3d99795..82462f8d922d5 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -19,6 +19,7 @@ is_categorical_dtype, is_integer, is_list_like, + is_re, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -1333,6 +1334,29 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): ) warnings.warn(msg, FutureWarning, stacklevel=3) regex = True + + # Check whether repl is valid (GH 13438, GH 15055) + if not (isinstance(repl, str) or callable(repl)): + raise TypeError("repl must be a string or callable") + + is_compiled_re = is_re(pat) + if regex: + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError( + "case and flags cannot be set when pat is a compiled regex" + ) + elif case is None: + # not a compiled regex, set default case + case = True + + elif is_compiled_re: + raise ValueError( + "Cannot use a compiled regex as replacement pattern with regex=False" + ) + elif callable(repl): + raise ValueError("Cannot use a callable replacement when regex=False") + result = self._data.array._str_replace( pat, repl, n=n, case=case, flags=flags, regex=regex ) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index b794690ccc5af..c7e4368a98c95 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -147,41 +147,20 @@ def _str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) return self._str_map(f, na_value=na, 
dtype=np.dtype(bool)) - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - # Check whether repl is valid (GH 13438, GH 15055) - if not (isinstance(repl, str) or callable(repl)): - raise TypeError("repl must be a string or callable") - + def _str_replace(self, pat, repl, n=-1, case: bool = True, flags=0, regex=True): is_compiled_re = is_re(pat) - if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - if is_compiled_re or len(pat) > 1 or flags or callable(repl): - n = n if n >= 0 else 0 - compiled = re.compile(pat, flags=flags) - f = lambda x: compiled.sub(repl=repl, string=x, count=n) - else: - f = lambda x: x.replace(pat, repl, n) + + if case is False: + # add case flag, if provided + flags |= re.IGNORECASE + + if regex and (is_compiled_re or len(pat) > 1 or flags or callable(repl)): + if not is_compiled_re: + pat = re.compile(pat, flags=flags) + + n = n if n >= 0 else 0 + f = lambda x: pat.sub(repl=repl, string=x, count=n) else: - if is_compiled_re: - raise ValueError( - "Cannot use a compiled regex as replacement pattern with " - "regex=False" - ) - if callable(repl): - raise ValueError("Cannot use a callable replacement when regex=False") f = lambda x: x.replace(pat, repl, n) return self._str_map(f, dtype=str) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 0c54042d983ad..3d33e34a9dcfe 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -266,144 +266,157 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): tm.assert_series_equal(result, exp) -def test_replace(): - values = Series(["fooBAD__barBAD", np.nan]) +def test_replace(any_string_dtype): + 
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) result = values.str.replace("BAD[_]*", "", regex=True) - exp = Series(["foobar", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foobar", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) result = values.str.replace("BAD[_]*", "", n=1, regex=True) - exp = Series(["foobarBAD", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) - # mixed + +def test_replace_mixed_object(): mixed = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) - rs = Series(mixed).str.replace("BAD[_]*", "", regex=True) - xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) + result = Series(mixed).str.replace("BAD[_]*", "", regex=True) + expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + assert isinstance(result, Series) + tm.assert_almost_equal(result, expected) - # flags + unicode - values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) - exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) + +def test_replace_unicode(any_string_dtype): + values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) + expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) - tm.assert_series_equal(result, exp) + tm.assert_series_equal(result, expected) + - # GH 13438 +@pytest.mark.parametrize("klass", [Series, Index]) +@pytest.mark.parametrize("repl", [None, 3, {"a": "b"}]) +@pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]]) +def test_replace_raises(any_string_dtype, klass, repl, data): + # https://github.com/pandas-dev/pandas/issues/13438 msg = "repl must be a string or callable" - for klass in 
(Series, Index): - for repl in (None, 3, {"a": "b"}): - for data in (["a", "b", None], ["a", "b", "c", "ad"]): - values = klass(data) - with pytest.raises(TypeError, match=msg): - values.str.replace("a", repl) + values = klass(data, dtype=any_string_dtype) + with pytest.raises(TypeError, match=msg): + values.str.replace("a", repl) -def test_replace_callable(): +def test_replace_callable(any_string_dtype): # GH 15055 - values = Series(["fooBAD__barBAD", np.nan]) + values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with callable repl = lambda m: m.group(0).swapcase() result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) - exp = Series(["foObaD__baRbaD", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None] +) +def test_replace_callable_raises(any_string_dtype, repl): + # GH 15055 + values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with wrong number of arguments, raising an error - p_err = ( + msg = ( r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " r"(?(3)required )positional arguments?" 
) - - repl = lambda: None - with pytest.raises(TypeError, match=p_err): + with pytest.raises(TypeError, match=msg): values.str.replace("a", repl) - repl = lambda m, x: None - with pytest.raises(TypeError, match=p_err): - values.str.replace("a", repl) - - repl = lambda m, x, y=None: None - with pytest.raises(TypeError, match=p_err): - values.str.replace("a", repl) +def test_replace_callable_named_groups(any_string_dtype): # test regex named groups - values = Series(["Foo Bar Baz", np.nan]) + values = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" repl = lambda m: m.group("middle").swapcase() result = values.str.replace(pat, repl, regex=True) - exp = Series(["bAR", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["bAR", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) -def test_replace_compiled_regex(): +def test_replace_compiled_regex(any_string_dtype): # GH 15446 - values = Series(["fooBAD__barBAD", np.nan]) + values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with compiled regex pat = re.compile(r"BAD_*") result = values.str.replace(pat, "", regex=True) - exp = Series(["foobar", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foobar", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) result = values.str.replace(pat, "", n=1, regex=True) - exp = Series(["foobarBAD", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) - # mixed + +def test_replace_compiled_regex_mixed_object(): + pat = re.compile(r"BAD_*") mixed = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) - rs = Series(mixed).str.replace(pat, "", regex=True) - xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) + result = 
Series(mixed).str.replace(pat, "", regex=True) + expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + assert isinstance(result, Series) + tm.assert_almost_equal(result, expected) + - # flags + unicode - values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) - exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) +def test_replace_compiled_regex_unicode(any_string_dtype): + values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) + expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) result = values.str.replace(pat, ", ") - tm.assert_series_equal(result, exp) + tm.assert_series_equal(result, expected) + +def test_replace_compiled_regex_raises(any_string_dtype): # case and flags provided to str.replace will have no effect # and will produce warnings - values = Series(["fooBAD__barBAD__bad", np.nan]) + values = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype) pat = re.compile(r"BAD_*") - with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, "", flags=re.IGNORECASE) + msg = "case and flags cannot be set when pat is a compiled regex" - with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, "", case=False) + with pytest.raises(ValueError, match=msg): + values.str.replace(pat, "", flags=re.IGNORECASE) - with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, "", case=True) + with pytest.raises(ValueError, match=msg): + values.str.replace(pat, "", case=False) + with pytest.raises(ValueError, match=msg): + values.str.replace(pat, "", case=True) + + +def test_replace_compiled_regex_callable(any_string_dtype): # test with callable - values = Series(["fooBAD__barBAD", np.nan]) + values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = 
re.compile("[a-z][A-Z]{2}") result = values.str.replace(pat, repl, n=2) - exp = Series(["foObaD__baRbaD", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) -def test_replace_literal(): +def test_replace_literal(any_string_dtype): # GH16808 literal replace (regex=False vs regex=True) - values = Series(["f.o", "foo", np.nan]) - exp = Series(["bao", "bao", np.nan]) + values = Series(["f.o", "foo", np.nan], dtype=any_string_dtype) + expected = Series(["bao", "bao", np.nan], dtype=any_string_dtype) result = values.str.replace("f.", "ba", regex=True) - tm.assert_series_equal(result, exp) + tm.assert_series_equal(result, expected) - exp = Series(["bao", "foo", np.nan]) + expected = Series(["bao", "foo", np.nan], dtype=any_string_dtype) result = values.str.replace("f.", "ba", regex=False) - tm.assert_series_equal(result, exp) + tm.assert_series_equal(result, expected) # Cannot do a literal replace if given a callable repl or compiled # pattern @@ -680,13 +693,17 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) -def test_replace_moar(): +def test_replace_moar(any_string_dtype): # PR #1179 - s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) + s = Series( + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], + dtype=any_string_dtype, + ) result = s.str.replace("A", "YYY") expected = Series( - ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"] + ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"], + dtype=any_string_dtype, ) tm.assert_series_equal(result, expected) @@ -703,7 +720,8 @@ def test_replace_moar(): "CYYYBYYY", "dog", "cYYYt", - ] + ], + dtype=any_string_dtype, ) tm.assert_series_equal(result, expected) @@ -720,7 +738,8 @@ def test_replace_moar(): "XX-XX BA", "XX-XX ", "XX-XX t", - ] + ], + dtype=any_string_dtype, ) 
tm.assert_series_equal(result, expected)