Skip to content

[ArrowStringArray] REF: precursor to adding replace str accessor method #41293

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
is_categorical_dtype,
is_integer,
is_list_like,
is_re,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
Expand Down Expand Up @@ -1333,6 +1334,29 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None):
)
warnings.warn(msg, FutureWarning, stacklevel=3)
regex = True

# Check whether repl is valid (GH 13438, GH 15055)
if not (isinstance(repl, str) or callable(repl)):
raise TypeError("repl must be a string or callable")

is_compiled_re = is_re(pat)
if regex:
if is_compiled_re:
if (case is not None) or (flags != 0):
raise ValueError(
"case and flags cannot be set when pat is a compiled regex"
)
elif case is None:
# not a compiled regex, set default case
case = True

elif is_compiled_re:
raise ValueError(
"Cannot use a compiled regex as replacement pattern with regex=False"
)
elif callable(repl):
raise ValueError("Cannot use a callable replacement when regex=False")

result = self._data.array._str_replace(
pat, repl, n=n, case=case, flags=flags, regex=regex
)
Expand Down
45 changes: 12 additions & 33 deletions pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,41 +147,20 @@ def _str_endswith(self, pat, na=None):
f = lambda x: x.endswith(pat)
return self._str_map(f, na_value=na, dtype=np.dtype(bool))

def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
# Check whether repl is valid (GH 13438, GH 15055)
if not (isinstance(repl, str) or callable(repl)):
raise TypeError("repl must be a string or callable")

def _str_replace(self, pat, repl, n=-1, case: bool = True, flags=0, regex=True):
is_compiled_re = is_re(pat)
if regex:
if is_compiled_re:
if (case is not None) or (flags != 0):
raise ValueError(
"case and flags cannot be set when pat is a compiled regex"
)
else:
# not a compiled regex
# set default case
if case is None:
case = True

# add case flag, if provided
if case is False:
flags |= re.IGNORECASE
if is_compiled_re or len(pat) > 1 or flags or callable(repl):
n = n if n >= 0 else 0
compiled = re.compile(pat, flags=flags)
f = lambda x: compiled.sub(repl=repl, string=x, count=n)
else:
f = lambda x: x.replace(pat, repl, n)

if case is False:
# add case flag, if provided
flags |= re.IGNORECASE

if regex and (is_compiled_re or len(pat) > 1 or flags or callable(repl)):
if not is_compiled_re:
pat = re.compile(pat, flags=flags)

n = n if n >= 0 else 0
f = lambda x: pat.sub(repl=repl, string=x, count=n)
else:
if is_compiled_re:
raise ValueError(
"Cannot use a compiled regex as replacement pattern with "
"regex=False"
)
if callable(repl):
raise ValueError("Cannot use a callable replacement when regex=False")
f = lambda x: x.replace(pat, repl, n)

return self._str_map(f, dtype=str)
Expand Down
171 changes: 95 additions & 76 deletions pandas/tests/strings/test_find_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,144 +266,157 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na):
tm.assert_series_equal(result, exp)


def test_replace():
values = Series(["fooBAD__barBAD", np.nan])
def test_replace(any_string_dtype):
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

result = values.str.replace("BAD[_]*", "", regex=True)
exp = Series(["foobar", np.nan])
tm.assert_series_equal(result, exp)
expected = Series(["foobar", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

result = values.str.replace("BAD[_]*", "", n=1, regex=True)
exp = Series(["foobarBAD", np.nan])
tm.assert_series_equal(result, exp)
expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

# mixed

def test_replace_mixed_object():
mixed = Series(
["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
)

rs = Series(mixed).str.replace("BAD[_]*", "", regex=True)
xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
result = Series(mixed).str.replace("BAD[_]*", "", regex=True)
expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
assert isinstance(result, Series)
tm.assert_almost_equal(result, expected)

# flags + unicode
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])

def test_replace_unicode(any_string_dtype):
values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
tm.assert_series_equal(result, exp)
tm.assert_series_equal(result, expected)


# GH 13438
@pytest.mark.parametrize("klass", [Series, Index])
@pytest.mark.parametrize("repl", [None, 3, {"a": "b"}])
@pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]])
def test_replace_raises(any_string_dtype, klass, repl, data):
# https://github.com/pandas-dev/pandas/issues/13438
msg = "repl must be a string or callable"
for klass in (Series, Index):
for repl in (None, 3, {"a": "b"}):
for data in (["a", "b", None], ["a", "b", "c", "ad"]):
values = klass(data)
with pytest.raises(TypeError, match=msg):
values.str.replace("a", repl)
values = klass(data, dtype=any_string_dtype)
with pytest.raises(TypeError, match=msg):
values.str.replace("a", repl)


def test_replace_callable():
def test_replace_callable(any_string_dtype):
# GH 15055
values = Series(["fooBAD__barBAD", np.nan])
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

# test with callable
repl = lambda m: m.group(0).swapcase()
result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
exp = Series(["foObaD__baRbaD", np.nan])
tm.assert_series_equal(result, exp)
expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None]
)
def test_replace_callable_raises(any_string_dtype, repl):
# GH 15055
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

# test with wrong number of arguments, raising an error
p_err = (
msg = (
r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
r"(?(3)required )positional arguments?"
)

repl = lambda: None
with pytest.raises(TypeError, match=p_err):
with pytest.raises(TypeError, match=msg):
values.str.replace("a", repl)

repl = lambda m, x: None
with pytest.raises(TypeError, match=p_err):
values.str.replace("a", repl)

repl = lambda m, x, y=None: None
with pytest.raises(TypeError, match=p_err):
values.str.replace("a", repl)

def test_replace_callable_named_groups(any_string_dtype):
# test regex named groups
values = Series(["Foo Bar Baz", np.nan])
values = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
repl = lambda m: m.group("middle").swapcase()
result = values.str.replace(pat, repl, regex=True)
exp = Series(["bAR", np.nan])
tm.assert_series_equal(result, exp)
expected = Series(["bAR", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)


def test_replace_compiled_regex():
def test_replace_compiled_regex(any_string_dtype):
# GH 15446
values = Series(["fooBAD__barBAD", np.nan])
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

# test with compiled regex
pat = re.compile(r"BAD_*")
result = values.str.replace(pat, "", regex=True)
exp = Series(["foobar", np.nan])
tm.assert_series_equal(result, exp)
expected = Series(["foobar", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

result = values.str.replace(pat, "", n=1, regex=True)
exp = Series(["foobarBAD", np.nan])
tm.assert_series_equal(result, exp)
expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

# mixed

def test_replace_compiled_regex_mixed_object():
pat = re.compile(r"BAD_*")
mixed = Series(
["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
)

rs = Series(mixed).str.replace(pat, "", regex=True)
xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)
result = Series(mixed).str.replace(pat, "", regex=True)
expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
assert isinstance(result, Series)
tm.assert_almost_equal(result, expected)


# flags + unicode
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
def test_replace_compiled_regex_unicode(any_string_dtype):
values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
result = values.str.replace(pat, ", ")
tm.assert_series_equal(result, exp)
tm.assert_series_equal(result, expected)


def test_replace_compiled_regex_raises(any_string_dtype):
# case and flags cannot be combined with a compiled regex pattern;
# passing either one to str.replace raises a ValueError
values = Series(["fooBAD__barBAD__bad", np.nan])
values = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype)
pat = re.compile(r"BAD_*")

with pytest.raises(ValueError, match="case and flags cannot be"):
result = values.str.replace(pat, "", flags=re.IGNORECASE)
msg = "case and flags cannot be set when pat is a compiled regex"

with pytest.raises(ValueError, match="case and flags cannot be"):
result = values.str.replace(pat, "", case=False)
with pytest.raises(ValueError, match=msg):
values.str.replace(pat, "", flags=re.IGNORECASE)

with pytest.raises(ValueError, match="case and flags cannot be"):
result = values.str.replace(pat, "", case=True)
with pytest.raises(ValueError, match=msg):
values.str.replace(pat, "", case=False)

with pytest.raises(ValueError, match=msg):
values.str.replace(pat, "", case=True)


def test_replace_compiled_regex_callable(any_string_dtype):
# test with callable
values = Series(["fooBAD__barBAD", np.nan])
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
repl = lambda m: m.group(0).swapcase()
pat = re.compile("[a-z][A-Z]{2}")
result = values.str.replace(pat, repl, n=2)
exp = Series(["foObaD__baRbaD", np.nan])
tm.assert_series_equal(result, exp)
expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)


def test_replace_literal():
def test_replace_literal(any_string_dtype):
# GH16808 literal replace (regex=False vs regex=True)
values = Series(["f.o", "foo", np.nan])
exp = Series(["bao", "bao", np.nan])
values = Series(["f.o", "foo", np.nan], dtype=any_string_dtype)
expected = Series(["bao", "bao", np.nan], dtype=any_string_dtype)
result = values.str.replace("f.", "ba", regex=True)
tm.assert_series_equal(result, exp)
tm.assert_series_equal(result, expected)

exp = Series(["bao", "foo", np.nan])
expected = Series(["bao", "foo", np.nan], dtype=any_string_dtype)
result = values.str.replace("f.", "ba", regex=False)
tm.assert_series_equal(result, exp)
tm.assert_series_equal(result, expected)

# Cannot do a literal replace if given a callable repl or compiled
# pattern
Expand Down Expand Up @@ -680,13 +693,17 @@ def test_contains_nan(any_string_dtype):
tm.assert_series_equal(result, expected)


def test_replace_moar():
def test_replace_moar(any_string_dtype):
# PR #1179
s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"])
s = Series(
["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
dtype=any_string_dtype,
)

result = s.str.replace("A", "YYY")
expected = Series(
["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"]
["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"],
dtype=any_string_dtype,
)
tm.assert_series_equal(result, expected)

Expand All @@ -703,7 +720,8 @@ def test_replace_moar():
"CYYYBYYY",
"dog",
"cYYYt",
]
],
dtype=any_string_dtype,
)
tm.assert_series_equal(result, expected)

Expand All @@ -720,7 +738,8 @@ def test_replace_moar():
"XX-XX BA",
"XX-XX ",
"XX-XX t",
]
],
dtype=any_string_dtype,
)
tm.assert_series_equal(result, expected)

Expand Down