Skip to content

Commit 5272b56

Browse files
[ArrowStringArray] precursor to adding replace str accessor method (#41293)
1 parent 7c89062 commit 5272b56

File tree

3 files changed

+131
-109
lines changed

3 files changed

+131
-109
lines changed

pandas/core/strings/accessor.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
is_categorical_dtype,
2020
is_integer,
2121
is_list_like,
22+
is_re,
2223
)
2324
from pandas.core.dtypes.generic import (
2425
ABCDataFrame,
@@ -1333,6 +1334,29 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None):
13331334
)
13341335
warnings.warn(msg, FutureWarning, stacklevel=3)
13351336
regex = True
1337+
1338+
# Check whether repl is valid (GH 13438, GH 15055)
1339+
if not (isinstance(repl, str) or callable(repl)):
1340+
raise TypeError("repl must be a string or callable")
1341+
1342+
is_compiled_re = is_re(pat)
1343+
if regex:
1344+
if is_compiled_re:
1345+
if (case is not None) or (flags != 0):
1346+
raise ValueError(
1347+
"case and flags cannot be set when pat is a compiled regex"
1348+
)
1349+
elif case is None:
1350+
# not a compiled regex, set default case
1351+
case = True
1352+
1353+
elif is_compiled_re:
1354+
raise ValueError(
1355+
"Cannot use a compiled regex as replacement pattern with regex=False"
1356+
)
1357+
elif callable(repl):
1358+
raise ValueError("Cannot use a callable replacement when regex=False")
1359+
13361360
result = self._data.array._str_replace(
13371361
pat, repl, n=n, case=case, flags=flags, regex=regex
13381362
)

pandas/core/strings/object_array.py

Lines changed: 12 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -147,41 +147,20 @@ def _str_endswith(self, pat, na=None):
147147
f = lambda x: x.endswith(pat)
148148
return self._str_map(f, na_value=na, dtype=np.dtype(bool))
149149

150-
def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
151-
# Check whether repl is valid (GH 13438, GH 15055)
152-
if not (isinstance(repl, str) or callable(repl)):
153-
raise TypeError("repl must be a string or callable")
154-
150+
def _str_replace(self, pat, repl, n=-1, case: bool = True, flags=0, regex=True):
155151
is_compiled_re = is_re(pat)
156-
if regex:
157-
if is_compiled_re:
158-
if (case is not None) or (flags != 0):
159-
raise ValueError(
160-
"case and flags cannot be set when pat is a compiled regex"
161-
)
162-
else:
163-
# not a compiled regex
164-
# set default case
165-
if case is None:
166-
case = True
167-
168-
# add case flag, if provided
169-
if case is False:
170-
flags |= re.IGNORECASE
171-
if is_compiled_re or len(pat) > 1 or flags or callable(repl):
172-
n = n if n >= 0 else 0
173-
compiled = re.compile(pat, flags=flags)
174-
f = lambda x: compiled.sub(repl=repl, string=x, count=n)
175-
else:
176-
f = lambda x: x.replace(pat, repl, n)
152+
153+
if case is False:
154+
# add case flag, if provided
155+
flags |= re.IGNORECASE
156+
157+
if regex and (is_compiled_re or len(pat) > 1 or flags or callable(repl)):
158+
if not is_compiled_re:
159+
pat = re.compile(pat, flags=flags)
160+
161+
n = n if n >= 0 else 0
162+
f = lambda x: pat.sub(repl=repl, string=x, count=n)
177163
else:
178-
if is_compiled_re:
179-
raise ValueError(
180-
"Cannot use a compiled regex as replacement pattern with "
181-
"regex=False"
182-
)
183-
if callable(repl):
184-
raise ValueError("Cannot use a callable replacement when regex=False")
185164
f = lambda x: x.replace(pat, repl, n)
186165

187166
return self._str_map(f, dtype=str)

pandas/tests/strings/test_find_replace.py

Lines changed: 95 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -266,144 +266,157 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na):
266266
tm.assert_series_equal(result, exp)
267267

268268

269-
def test_replace():
270-
values = Series(["fooBAD__barBAD", np.nan])
269+
def test_replace(any_string_dtype):
270+
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
271271

272272
result = values.str.replace("BAD[_]*", "", regex=True)
273-
exp = Series(["foobar", np.nan])
274-
tm.assert_series_equal(result, exp)
273+
expected = Series(["foobar", np.nan], dtype=any_string_dtype)
274+
tm.assert_series_equal(result, expected)
275275

276276
result = values.str.replace("BAD[_]*", "", n=1, regex=True)
277-
exp = Series(["foobarBAD", np.nan])
278-
tm.assert_series_equal(result, exp)
277+
expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
278+
tm.assert_series_equal(result, expected)
279279

280-
# mixed
280+
281+
def test_replace_mixed_object():
281282
mixed = Series(
282283
["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
283284
)
284285

285-
rs = Series(mixed).str.replace("BAD[_]*", "", regex=True)
286-
xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
287-
assert isinstance(rs, Series)
288-
tm.assert_almost_equal(rs, xp)
286+
result = Series(mixed).str.replace("BAD[_]*", "", regex=True)
287+
expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
288+
assert isinstance(result, Series)
289+
tm.assert_almost_equal(result, expected)
289290

290-
# flags + unicode
291-
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
292-
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
291+
292+
def test_replace_unicode(any_string_dtype):
293+
values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
294+
expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
293295
result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
294-
tm.assert_series_equal(result, exp)
296+
tm.assert_series_equal(result, expected)
297+
295298

296-
# GH 13438
299+
@pytest.mark.parametrize("klass", [Series, Index])
300+
@pytest.mark.parametrize("repl", [None, 3, {"a": "b"}])
301+
@pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]])
302+
def test_replace_raises(any_string_dtype, klass, repl, data):
303+
# https://github.com/pandas-dev/pandas/issues/13438
297304
msg = "repl must be a string or callable"
298-
for klass in (Series, Index):
299-
for repl in (None, 3, {"a": "b"}):
300-
for data in (["a", "b", None], ["a", "b", "c", "ad"]):
301-
values = klass(data)
302-
with pytest.raises(TypeError, match=msg):
303-
values.str.replace("a", repl)
305+
values = klass(data, dtype=any_string_dtype)
306+
with pytest.raises(TypeError, match=msg):
307+
values.str.replace("a", repl)
304308

305309

306-
def test_replace_callable():
310+
def test_replace_callable(any_string_dtype):
307311
# GH 15055
308-
values = Series(["fooBAD__barBAD", np.nan])
312+
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
309313

310314
# test with callable
311315
repl = lambda m: m.group(0).swapcase()
312316
result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
313-
exp = Series(["foObaD__baRbaD", np.nan])
314-
tm.assert_series_equal(result, exp)
317+
expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
318+
tm.assert_series_equal(result, expected)
319+
320+
321+
@pytest.mark.parametrize(
322+
"repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None]
323+
)
324+
def test_replace_callable_raises(any_string_dtype, repl):
325+
# GH 15055
326+
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
315327

316328
# test with wrong number of arguments, raising an error
317-
p_err = (
329+
msg = (
318330
r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
319331
r"(?(3)required )positional arguments?"
320332
)
321-
322-
repl = lambda: None
323-
with pytest.raises(TypeError, match=p_err):
333+
with pytest.raises(TypeError, match=msg):
324334
values.str.replace("a", repl)
325335

326-
repl = lambda m, x: None
327-
with pytest.raises(TypeError, match=p_err):
328-
values.str.replace("a", repl)
329-
330-
repl = lambda m, x, y=None: None
331-
with pytest.raises(TypeError, match=p_err):
332-
values.str.replace("a", repl)
333336

337+
def test_replace_callable_named_groups(any_string_dtype):
334338
# test regex named groups
335-
values = Series(["Foo Bar Baz", np.nan])
339+
values = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
336340
pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
337341
repl = lambda m: m.group("middle").swapcase()
338342
result = values.str.replace(pat, repl, regex=True)
339-
exp = Series(["bAR", np.nan])
340-
tm.assert_series_equal(result, exp)
343+
expected = Series(["bAR", np.nan], dtype=any_string_dtype)
344+
tm.assert_series_equal(result, expected)
341345

342346

343-
def test_replace_compiled_regex():
347+
def test_replace_compiled_regex(any_string_dtype):
344348
# GH 15446
345-
values = Series(["fooBAD__barBAD", np.nan])
349+
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
346350

347351
# test with compiled regex
348352
pat = re.compile(r"BAD_*")
349353
result = values.str.replace(pat, "", regex=True)
350-
exp = Series(["foobar", np.nan])
351-
tm.assert_series_equal(result, exp)
354+
expected = Series(["foobar", np.nan], dtype=any_string_dtype)
355+
tm.assert_series_equal(result, expected)
352356

353357
result = values.str.replace(pat, "", n=1, regex=True)
354-
exp = Series(["foobarBAD", np.nan])
355-
tm.assert_series_equal(result, exp)
358+
expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
359+
tm.assert_series_equal(result, expected)
356360

357-
# mixed
361+
362+
def test_replace_compiled_regex_mixed_object():
363+
pat = re.compile(r"BAD_*")
358364
mixed = Series(
359365
["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
360366
)
361367

362-
rs = Series(mixed).str.replace(pat, "", regex=True)
363-
xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
364-
assert isinstance(rs, Series)
365-
tm.assert_almost_equal(rs, xp)
368+
result = Series(mixed).str.replace(pat, "", regex=True)
369+
expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
370+
assert isinstance(result, Series)
371+
tm.assert_almost_equal(result, expected)
372+
366373

367-
# flags + unicode
368-
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
369-
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
374+
def test_replace_compiled_regex_unicode(any_string_dtype):
375+
values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
376+
expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
370377
pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
371378
result = values.str.replace(pat, ", ")
372-
tm.assert_series_equal(result, exp)
379+
tm.assert_series_equal(result, expected)
373380

381+
382+
def test_replace_compiled_regex_raises(any_string_dtype):
374383
# case and flags cannot be passed to str.replace together with a
375384
# compiled regex pattern; doing so raises a ValueError
376-
values = Series(["fooBAD__barBAD__bad", np.nan])
385+
values = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype)
377386
pat = re.compile(r"BAD_*")
378387

379-
with pytest.raises(ValueError, match="case and flags cannot be"):
380-
result = values.str.replace(pat, "", flags=re.IGNORECASE)
388+
msg = "case and flags cannot be set when pat is a compiled regex"
381389

382-
with pytest.raises(ValueError, match="case and flags cannot be"):
383-
result = values.str.replace(pat, "", case=False)
390+
with pytest.raises(ValueError, match=msg):
391+
values.str.replace(pat, "", flags=re.IGNORECASE)
384392

385-
with pytest.raises(ValueError, match="case and flags cannot be"):
386-
result = values.str.replace(pat, "", case=True)
393+
with pytest.raises(ValueError, match=msg):
394+
values.str.replace(pat, "", case=False)
387395

396+
with pytest.raises(ValueError, match=msg):
397+
values.str.replace(pat, "", case=True)
398+
399+
400+
def test_replace_compiled_regex_callable(any_string_dtype):
388401
# test with callable
389-
values = Series(["fooBAD__barBAD", np.nan])
402+
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
390403
repl = lambda m: m.group(0).swapcase()
391404
pat = re.compile("[a-z][A-Z]{2}")
392405
result = values.str.replace(pat, repl, n=2)
393-
exp = Series(["foObaD__baRbaD", np.nan])
394-
tm.assert_series_equal(result, exp)
406+
expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
407+
tm.assert_series_equal(result, expected)
395408

396409

397-
def test_replace_literal():
410+
def test_replace_literal(any_string_dtype):
398411
# GH16808 literal replace (regex=False vs regex=True)
399-
values = Series(["f.o", "foo", np.nan])
400-
exp = Series(["bao", "bao", np.nan])
412+
values = Series(["f.o", "foo", np.nan], dtype=any_string_dtype)
413+
expected = Series(["bao", "bao", np.nan], dtype=any_string_dtype)
401414
result = values.str.replace("f.", "ba", regex=True)
402-
tm.assert_series_equal(result, exp)
415+
tm.assert_series_equal(result, expected)
403416

404-
exp = Series(["bao", "foo", np.nan])
417+
expected = Series(["bao", "foo", np.nan], dtype=any_string_dtype)
405418
result = values.str.replace("f.", "ba", regex=False)
406-
tm.assert_series_equal(result, exp)
419+
tm.assert_series_equal(result, expected)
407420

408421
# Cannot do a literal replace if given a callable repl or compiled
409422
# pattern
@@ -680,13 +693,17 @@ def test_contains_nan(any_string_dtype):
680693
tm.assert_series_equal(result, expected)
681694

682695

683-
def test_replace_moar():
696+
def test_replace_moar(any_string_dtype):
684697
# PR #1179
685-
s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"])
698+
s = Series(
699+
["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
700+
dtype=any_string_dtype,
701+
)
686702

687703
result = s.str.replace("A", "YYY")
688704
expected = Series(
689-
["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"]
705+
["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"],
706+
dtype=any_string_dtype,
690707
)
691708
tm.assert_series_equal(result, expected)
692709

@@ -703,7 +720,8 @@ def test_replace_moar():
703720
"CYYYBYYY",
704721
"dog",
705722
"cYYYt",
706-
]
723+
],
724+
dtype=any_string_dtype,
707725
)
708726
tm.assert_series_equal(result, expected)
709727

@@ -720,7 +738,8 @@ def test_replace_moar():
720738
"XX-XX BA",
721739
"XX-XX ",
722740
"XX-XX t",
723-
]
741+
],
742+
dtype=any_string_dtype,
724743
)
725744
tm.assert_series_equal(result, expected)
726745

0 commit comments

Comments
 (0)