Skip to content

Commit 7427288

Browse files
committed
ENH: Add regex=True flag to str_contains
Using regex=False can be faster when full regex searching is not needed. See http://stackoverflow.com/q/20951840/190597. TST: Add a test for str_contains with regex=False. BUG: Not all strings.str_* functions return an object with an ndim attribute. PERF: Add benchmarks for every str method.
1 parent a60887d commit 7427288

File tree

5 files changed

+83
-16
lines changed

5 files changed

+83
-16
lines changed

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ Improvements to existing features
7171
- `option_context` context manager now available as top-level API (:issue:`5752`)
7272
- df.info() view now displays dtype info per column (:issue:`5682`)
7373
- perf improvements in DataFrame ``count/dropna`` for ``axis=1``
74+
- Series.str.contains now has a ``regex=False`` keyword which can be faster for plain (non-regex) string patterns. (:issue:`5879`)
7475

7576
Bug Fixes
7677
~~~~~~~~~

pandas/core/strings.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def str_count(arr, pat, flags=0):
148148
return _na_map(f, arr)
149149

150150

151-
def str_contains(arr, pat, case=True, flags=0, na=np.nan):
151+
def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
152152
"""
153153
Check whether given pattern is contained in each string in the array
154154
@@ -161,7 +161,9 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan):
161161
flags : int, default 0 (no flags)
162162
re module flags, e.g. re.IGNORECASE
163163
na : default NaN, fill value for missing values.
164-
164+
regex : bool, default True
165+
If True, use re.search; otherwise use the Python ``in`` operator for a plain substring test (``case`` and ``flags`` are then ignored)
166+
165167
Returns
166168
-------
167169
Series of boolean values
@@ -171,17 +173,21 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan):
171173
match : analogous, but stricter, relying on re.match instead of re.search
172174
173175
"""
174-
if not case:
175-
flags |= re.IGNORECASE
176+
if regex:
177+
if not case:
178+
flags |= re.IGNORECASE
176179

177-
regex = re.compile(pat, flags=flags)
180+
regex = re.compile(pat, flags=flags)
178181

179-
if regex.groups > 0:
180-
warnings.warn("This pattern has match groups. To actually get the"
181-
" groups, use str.extract.", UserWarning)
182+
if regex.groups > 0:
183+
warnings.warn("This pattern has match groups. To actually get the"
184+
" groups, use str.extract.", UserWarning)
182185

183-
f = lambda x: bool(regex.search(x))
186+
f = lambda x: bool(regex.search(x))
187+
else:
188+
f = lambda x: pat in x
184189
return _na_map(f, arr, na)
190+
185191

186192

187193
def str_startswith(arr, pat, na=np.nan):
@@ -816,11 +822,13 @@ def __iter__(self):
816822
g = self.get(i)
817823

818824
def _wrap_result(self, result):
819-
assert result.ndim < 3
820-
if result.ndim == 1:
825+
if not hasattr(result, 'ndim'):
826+
return result
827+
elif result.ndim == 1:
821828
return Series(result, index=self.series.index,
822829
name=self.series.name)
823830
else:
831+
assert result.ndim < 3
824832
return DataFrame(result, index=self.series.index)
825833

826834
@copy(str_cat)
@@ -844,11 +852,11 @@ def join(self, sep):
844852
return self._wrap_result(result)
845853

846854
@copy(str_contains)
847-
def contains(self, pat, case=True, flags=0, na=np.nan):
855+
def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
848856
result = str_contains(self.series, pat, case=case, flags=flags,
849-
na=na)
857+
na=na, regex=regex)
850858
return self._wrap_result(result)
851-
859+
852860
@copy(str_replace)
853861
def replace(self, pat, repl, n=-1, case=True, flags=0):
854862
result = str_replace(self.series, pat, repl, n=n, case=case,

pandas/tests/test_strings.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,11 +167,15 @@ def test_count(self):
167167
tm.assert_almost_equal(result, exp)
168168

169169
def test_contains(self):
170-
values = ['foo', NA, 'fooommm__foo', 'mmm_']
170+
values = ['foo', NA, 'fooommm__foo', 'mmm_', 'foommm[_]+bar']
171171
pat = 'mmm[_]+'
172172

173173
result = strings.str_contains(values, pat)
174-
expected = [False, np.nan, True, True]
174+
expected = [False, NA, True, True, False]
175+
tm.assert_almost_equal(result, expected)
176+
177+
result = strings.str_contains(values, pat, regex=False)
178+
expected = [False, NA, False, False, True]
175179
tm.assert_almost_equal(result, expected)
176180

177181
values = ['foo', 'xyz', 'fooommm__foo', 'mmm_']

vb_suite/strings.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from vbench.api import Benchmark
2+
3+
common_setup = """from pandas_vb_common import *
4+
"""
5+
6+
setup = common_setup + """
7+
import string
8+
import itertools as IT
9+
10+
def make_series(letters, strlen, size):
11+
return Series(
12+
np.fromiter(IT.cycle(letters), count=size*strlen, dtype='|S1')
13+
.view('|S{}'.format(strlen)))
14+
15+
many = make_series('matchthis'+string.uppercase, strlen=19, size=10000) # 31% matches
16+
few = make_series('matchthis'+string.uppercase*42, strlen=19, size=10000) # 1% matches
17+
"""
18+
19+
strings_cat = Benchmark("many.str.cat(sep=',')", setup)
20+
strings_title = Benchmark("many.str.title()", setup)
21+
strings_count = Benchmark("many.str.count('matchthis')", setup)
22+
strings_contains_many = Benchmark("many.str.contains('matchthis')", setup)
23+
strings_contains_few = Benchmark("few.str.contains('matchthis')", setup)
24+
strings_contains_many_noregex = Benchmark(
25+
"many.str.contains('matchthis', regex=False)", setup)
26+
strings_contains_few_noregex = Benchmark(
27+
"few.str.contains('matchthis', regex=False)", setup)
28+
strings_startswith = Benchmark("many.str.startswith('matchthis')", setup)
29+
strings_endswith = Benchmark("many.str.endswith('matchthis')", setup)
30+
strings_lower = Benchmark("many.str.lower()", setup)
31+
strings_upper = Benchmark("many.str.upper()", setup)
32+
strings_replace = Benchmark("many.str.replace(r'(matchthis)', r'\1\1')", setup)
33+
strings_repeat = Benchmark(
34+
"many.str.repeat(list(IT.islice(IT.cycle(range(1,4)),len(many))))", setup)
35+
strings_match = Benchmark("many.str.match(r'mat..this')", setup)
36+
strings_extract = Benchmark("many.str.extract(r'(\w*)matchthis(\w*)')", setup)
37+
strings_join_split = Benchmark("many.str.join(r'--').str.split('--')", setup)
38+
strings_len = Benchmark("many.str.len()", setup)
39+
strings_findall = Benchmark("many.str.findall(r'[A-Z]+')", setup)
40+
strings_pad = Benchmark("many.str.pad(100, side='both')", setup)
41+
strings_center = Benchmark("many.str.center(100)", setup)
42+
strings_slice = Benchmark("many.str.slice(5,15,2)", setup)
43+
strings_strip = Benchmark("many.str.strip('matchthis')", setup)
44+
strings_lstrip = Benchmark("many.str.lstrip('matchthis')", setup)
45+
strings_rstrip = Benchmark("many.str.rstrip('matchthis')", setup)
46+
strings_get = Benchmark("many.str.get(0)", setup)
47+
48+
setup = common_setup + """
49+
import pandas.util.testing as testing
50+
ser = pd.Series(testing.makeUnicodeIndex())
51+
"""
52+
53+
strings_encode_decode = Benchmark("ser.str.encode('utf-8').str.decode('utf-8')", setup)

vb_suite/suite.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
'reindex',
2323
'replace',
2424
'sparse',
25+
'strings',
2526
'reshape',
2627
'stat_ops',
2728
'timeseries',

0 commit comments

Comments
 (0)