From a33339d5e7ab440bb11a9c5c15bd3b9818478869 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 3 May 2021 22:38:44 +0100
Subject: [PATCH] [ArrowStringArray] precursor to adding replace str accessor
 method

---
 pandas/core/strings/accessor.py           |  24 +++
 pandas/core/strings/object_array.py       |  45 ++----
 pandas/tests/strings/test_find_replace.py | 171 ++++++++++++----------
 3 files changed, 131 insertions(+), 109 deletions(-)

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 85a58d3d99795..82462f8d922d5 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -19,6 +19,7 @@
     is_categorical_dtype,
     is_integer,
     is_list_like,
+    is_re,
 )
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -1333,6 +1334,29 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None):
                     )
                 warnings.warn(msg, FutureWarning, stacklevel=3)
             regex = True
+
+        # Check whether repl is valid (GH 13438, GH 15055)
+        if not (isinstance(repl, str) or callable(repl)):
+            raise TypeError("repl must be a string or callable")
+
+        is_compiled_re = is_re(pat)
+        if regex:
+            if is_compiled_re:
+                if (case is not None) or (flags != 0):
+                    raise ValueError(
+                        "case and flags cannot be set when pat is a compiled regex"
+                    )
+            elif case is None:
+                # not a compiled regex, set default case
+                case = True
+
+        elif is_compiled_re:
+            raise ValueError(
+                "Cannot use a compiled regex as replacement pattern with regex=False"
+            )
+        elif callable(repl):
+            raise ValueError("Cannot use a callable replacement when regex=False")
+
         result = self._data.array._str_replace(
             pat, repl, n=n, case=case, flags=flags, regex=regex
         )
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index b794690ccc5af..c7e4368a98c95 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -147,41 +147,20 @@ def _str_endswith(self, pat, na=None):
         f = lambda x: x.endswith(pat)
         return self._str_map(f, na_value=na, dtype=np.dtype(bool))
 
-    def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
-        # Check whether repl is valid (GH 13438, GH 15055)
-        if not (isinstance(repl, str) or callable(repl)):
-            raise TypeError("repl must be a string or callable")
-
+    def _str_replace(self, pat, repl, n=-1, case: bool = True, flags=0, regex=True):
         is_compiled_re = is_re(pat)
-        if regex:
-            if is_compiled_re:
-                if (case is not None) or (flags != 0):
-                    raise ValueError(
-                        "case and flags cannot be set when pat is a compiled regex"
-                    )
-            else:
-                # not a compiled regex
-                # set default case
-                if case is None:
-                    case = True
-
-                # add case flag, if provided
-                if case is False:
-                    flags |= re.IGNORECASE
-            if is_compiled_re or len(pat) > 1 or flags or callable(repl):
-                n = n if n >= 0 else 0
-                compiled = re.compile(pat, flags=flags)
-                f = lambda x: compiled.sub(repl=repl, string=x, count=n)
-            else:
-                f = lambda x: x.replace(pat, repl, n)
+
+        if case is False:
+            # case-insensitive match requested: add IGNORECASE to the regex flags
+            flags |= re.IGNORECASE
+
+        if regex and (is_compiled_re or len(pat) > 1 or flags or callable(repl)):
+            if not is_compiled_re:
+                pat = re.compile(pat, flags=flags)
+
+            n = n if n >= 0 else 0
+            f = lambda x: pat.sub(repl=repl, string=x, count=n)
         else:
-            if is_compiled_re:
-                raise ValueError(
-                    "Cannot use a compiled regex as replacement pattern with "
-                    "regex=False"
-                )
-            if callable(repl):
-                raise ValueError("Cannot use a callable replacement when regex=False")
             f = lambda x: x.replace(pat, repl, n)
 
         return self._str_map(f, dtype=str)
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index 0c54042d983ad..3d33e34a9dcfe 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -266,144 +266,157 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na):
     tm.assert_series_equal(result, exp)
 
 
-def test_replace():
-    values = Series(["fooBAD__barBAD", np.nan])
+def test_replace(any_string_dtype):
+    values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
 
     result = values.str.replace("BAD[_]*", "", regex=True)
-    exp = Series(["foobar", np.nan])
-    tm.assert_series_equal(result, exp)
+    expected = Series(["foobar", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
 
     result = values.str.replace("BAD[_]*", "", n=1, regex=True)
-    exp = Series(["foobarBAD", np.nan])
-    tm.assert_series_equal(result, exp)
+    expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
 
-    # mixed
+
+def test_replace_mixed_object():
     mixed = Series(
         ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
     )
 
-    rs = Series(mixed).str.replace("BAD[_]*", "", regex=True)
-    xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
-    assert isinstance(rs, Series)
-    tm.assert_almost_equal(rs, xp)
+    result = Series(mixed).str.replace("BAD[_]*", "", regex=True)
+    expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
+    assert isinstance(result, Series)
+    tm.assert_almost_equal(result, expected)
 
-    # flags + unicode
-    values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
-    exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
+
+def test_replace_unicode(any_string_dtype):
+    values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
+    expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
     result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
-    tm.assert_series_equal(result, exp)
+    tm.assert_series_equal(result, expected)
+
 
-    # GH 13438
+@pytest.mark.parametrize("klass", [Series, Index])
+@pytest.mark.parametrize("repl", [None, 3, {"a": "b"}])
+@pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]])
+def test_replace_raises(any_string_dtype, klass, repl, data):
+    # https://github.com/pandas-dev/pandas/issues/13438
     msg = "repl must be a string or callable"
-    for klass in (Series, Index):
-        for repl in (None, 3, {"a": "b"}):
-            for data in (["a", "b", None], ["a", "b", "c", "ad"]):
-                values = klass(data)
-                with pytest.raises(TypeError, match=msg):
-                    values.str.replace("a", repl)
+    values = klass(data, dtype=any_string_dtype)
+    with pytest.raises(TypeError, match=msg):
+        values.str.replace("a", repl)
 
 
-def test_replace_callable():
+def test_replace_callable(any_string_dtype):
     # GH 15055
-    values = Series(["fooBAD__barBAD", np.nan])
+    values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
 
     # test with callable
     repl = lambda m: m.group(0).swapcase()
     result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
-    exp = Series(["foObaD__baRbaD", np.nan])
-    tm.assert_series_equal(result, exp)
+    expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None]
+)
+def test_replace_callable_raises(any_string_dtype, repl):
+    # GH 15055
+    values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
 
     # test with wrong number of arguments, raising an error
-    p_err = (
+    msg = (
         r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
         r"(?(3)required )positional arguments?"
     )
-
-    repl = lambda: None
-    with pytest.raises(TypeError, match=p_err):
+    with pytest.raises(TypeError, match=msg):
         values.str.replace("a", repl)
 
-    repl = lambda m, x: None
-    with pytest.raises(TypeError, match=p_err):
-        values.str.replace("a", repl)
-
-    repl = lambda m, x, y=None: None
-    with pytest.raises(TypeError, match=p_err):
-        values.str.replace("a", repl)
 
+def test_replace_callable_named_groups(any_string_dtype):
     # test regex named groups
-    values = Series(["Foo Bar Baz", np.nan])
+    values = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
     pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
     repl = lambda m: m.group("middle").swapcase()
     result = values.str.replace(pat, repl, regex=True)
-    exp = Series(["bAR", np.nan])
-    tm.assert_series_equal(result, exp)
+    expected = Series(["bAR", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
 
 
-def test_replace_compiled_regex():
+def test_replace_compiled_regex(any_string_dtype):
     # GH 15446
-    values = Series(["fooBAD__barBAD", np.nan])
+    values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
 
     # test with compiled regex
     pat = re.compile(r"BAD_*")
     result = values.str.replace(pat, "", regex=True)
-    exp = Series(["foobar", np.nan])
-    tm.assert_series_equal(result, exp)
+    expected = Series(["foobar", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
 
     result = values.str.replace(pat, "", n=1, regex=True)
-    exp = Series(["foobarBAD", np.nan])
-    tm.assert_series_equal(result, exp)
+    expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
 
-    # mixed
+
+def test_replace_compiled_regex_mixed_object():
+    pat = re.compile(r"BAD_*")
     mixed = Series(
         ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
     )
 
-    rs = Series(mixed).str.replace(pat, "", regex=True)
-    xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
-    assert isinstance(rs, Series)
-    tm.assert_almost_equal(rs, xp)
+    result = Series(mixed).str.replace(pat, "", regex=True)
+    expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
+    assert isinstance(result, Series)
+    tm.assert_almost_equal(result, expected)
+
 
-    # flags + unicode
-    values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
-    exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
+def test_replace_compiled_regex_unicode(any_string_dtype):
+    values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
+    expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
     pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
     result = values.str.replace(pat, ", ")
-    tm.assert_series_equal(result, exp)
+    tm.assert_series_equal(result, expected)
 
+
+def test_replace_compiled_regex_raises(any_string_dtype):
     # case and flags provided to str.replace will have no effect
     # and will produce warnings
-    values = Series(["fooBAD__barBAD__bad", np.nan])
+    values = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype)
     pat = re.compile(r"BAD_*")
 
-    with pytest.raises(ValueError, match="case and flags cannot be"):
-        result = values.str.replace(pat, "", flags=re.IGNORECASE)
+    msg = "case and flags cannot be set when pat is a compiled regex"
 
-    with pytest.raises(ValueError, match="case and flags cannot be"):
-        result = values.str.replace(pat, "", case=False)
+    with pytest.raises(ValueError, match=msg):
+        values.str.replace(pat, "", flags=re.IGNORECASE)
 
-    with pytest.raises(ValueError, match="case and flags cannot be"):
-        result = values.str.replace(pat, "", case=True)
+    with pytest.raises(ValueError, match=msg):
+        values.str.replace(pat, "", case=False)
 
+    with pytest.raises(ValueError, match=msg):
+        values.str.replace(pat, "", case=True)
+
+
+def test_replace_compiled_regex_callable(any_string_dtype):
     # test with callable
-    values = Series(["fooBAD__barBAD", np.nan])
+    values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
     repl = lambda m: m.group(0).swapcase()
     pat = re.compile("[a-z][A-Z]{2}")
     result = values.str.replace(pat, repl, n=2)
-    exp = Series(["foObaD__baRbaD", np.nan])
-    tm.assert_series_equal(result, exp)
+    expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
 
 
-def test_replace_literal():
+def test_replace_literal(any_string_dtype):
     # GH16808 literal replace (regex=False vs regex=True)
-    values = Series(["f.o", "foo", np.nan])
-    exp = Series(["bao", "bao", np.nan])
+    values = Series(["f.o", "foo", np.nan], dtype=any_string_dtype)
+    expected = Series(["bao", "bao", np.nan], dtype=any_string_dtype)
     result = values.str.replace("f.", "ba", regex=True)
-    tm.assert_series_equal(result, exp)
+    tm.assert_series_equal(result, expected)
 
-    exp = Series(["bao", "foo", np.nan])
+    expected = Series(["bao", "foo", np.nan], dtype=any_string_dtype)
     result = values.str.replace("f.", "ba", regex=False)
-    tm.assert_series_equal(result, exp)
+    tm.assert_series_equal(result, expected)
 
     # Cannot do a literal replace if given a callable repl or compiled
     # pattern
@@ -680,13 +693,17 @@ def test_contains_nan(any_string_dtype):
     tm.assert_series_equal(result, expected)
 
 
-def test_replace_moar():
+def test_replace_moar(any_string_dtype):
     # PR #1179
-    s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"])
+    s = Series(
+        ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
+        dtype=any_string_dtype,
+    )
 
     result = s.str.replace("A", "YYY")
     expected = Series(
-        ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"]
+        ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"],
+        dtype=any_string_dtype,
     )
     tm.assert_series_equal(result, expected)
 
@@ -703,7 +720,8 @@ def test_replace_moar():
             "CYYYBYYY",
             "dog",
             "cYYYt",
-        ]
+        ],
+        dtype=any_string_dtype,
     )
     tm.assert_series_equal(result, expected)
 
@@ -720,7 +738,8 @@ def test_replace_moar():
             "XX-XX BA",
             "XX-XX ",
             "XX-XX t",
-        ]
+        ],
+        dtype=any_string_dtype,
     )
     tm.assert_series_equal(result, expected)