ENH: Add regex=True flag to str_contains

unutbu · unutbu · commit 7427288f09be · 2014-01-14T16:52:28.000-05:00
Using regex=False can be faster when full regex searching is not needed (see http://stackoverflow.com/q/20951840/190597). TST: Add a test for str_contains with regex=False. BUG: Not all strings.str_* functions return an object with an ndim attribute, so _wrap_result no longer assumes one. PERF: Add benchmarks for every str method.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -71,6 +71,7 @@ Improvements to existing features
   - `option_context` context manager now available as top-level API (:issue:`5752`)
   - df.info() view now display dtype info per column (:issue: `5682`)
   - perf improvements in DataFrame ``count/dropna`` for ``axis=1``
+  - ``Series.str.contains`` now has a ``regex`` keyword (default ``True``); passing ``regex=False`` can be faster for plain (non-regex) string patterns (:issue:`5879`)
 
 Bug Fixes
 ~~~~~~~~~
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -148,7 +148,7 @@ def str_count(arr, pat, flags=0):
     return _na_map(f, arr)
 
 
-def str_contains(arr, pat, case=True, flags=0, na=np.nan):
+def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
     """
     Check whether given pattern is contained in each string in the array
 
@@ -161,7 +161,9 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan):
     flags : int, default 0 (no flags)
         re module flags, e.g. re.IGNORECASE
     na : default NaN, fill value for missing values.
-
+    regex : bool, default True
+        If True, use re.search; if False, use the Python `in` operator (case and flags are then ignored)
+        
     Returns
     -------
     Series of boolean values
@@ -171,17 +173,21 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan):
     match : analagous, but stricter, relying on re.match instead of re.search
 
     """
-    if not case:
-        flags |= re.IGNORECASE
+    if regex:
+        if not case:
+            flags |= re.IGNORECASE
 
-    regex = re.compile(pat, flags=flags)
+        regex = re.compile(pat, flags=flags)
 
-    if regex.groups > 0:
-        warnings.warn("This pattern has match groups. To actually get the"
-                      " groups, use str.extract.", UserWarning)
+        if regex.groups > 0:
+            warnings.warn("This pattern has match groups. To actually get the"
+                          " groups, use str.extract.", UserWarning)
 
-    f = lambda x: bool(regex.search(x))
+        f = lambda x: bool(regex.search(x))
+    else:
+        f = lambda x: pat in x
     return _na_map(f, arr, na)
+        
 
 
 def str_startswith(arr, pat, na=np.nan):
@@ -816,11 +822,13 @@ def __iter__(self):
             g = self.get(i)
 
     def _wrap_result(self, result):
-        assert result.ndim < 3
-        if result.ndim == 1:
+        if not hasattr(result, 'ndim'):
+            return result
+        elif result.ndim == 1:
             return Series(result, index=self.series.index,
                           name=self.series.name)
         else:
+            assert result.ndim < 3
             return DataFrame(result, index=self.series.index)
 
     @copy(str_cat)
@@ -844,11 +852,11 @@ def join(self, sep):
         return self._wrap_result(result)
 
     @copy(str_contains)
-    def contains(self, pat, case=True, flags=0, na=np.nan):
+    def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
         result = str_contains(self.series, pat, case=case, flags=flags,
-                              na=na)
+                              na=na, regex=regex)
         return self._wrap_result(result)
-
+            
     @copy(str_replace)
     def replace(self, pat, repl, n=-1, case=True, flags=0):
         result = str_replace(self.series, pat, repl, n=n, case=case,
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -167,11 +167,15 @@ def test_count(self):
         tm.assert_almost_equal(result, exp)
 
     def test_contains(self):
-        values = ['foo', NA, 'fooommm__foo', 'mmm_']
+        values = ['foo', NA, 'fooommm__foo', 'mmm_', 'foommm[_]+bar']
         pat = 'mmm[_]+'
 
         result = strings.str_contains(values, pat)
-        expected = [False, np.nan, True, True]
+        expected = [False, NA, True, True, False]
+        tm.assert_almost_equal(result, expected)
+
+        result = strings.str_contains(values, pat, regex=False)
+        expected = [False, NA, False, False, True]
         tm.assert_almost_equal(result, expected)
 
         values = ['foo', 'xyz', 'fooommm__foo', 'mmm_']
diff --git a/vb_suite/strings.py b/vb_suite/strings.py
@@ -0,0 +1,53 @@
+from vbench.api import Benchmark
+
+common_setup = """from pandas_vb_common import *
+"""
+
+setup = common_setup + """
+import string
+import itertools as IT
+
+def make_series(letters, strlen, size):
+    return Series(
+        np.fromiter(IT.cycle(letters), count=size*strlen, dtype='|S1')
+        .view('|S{}'.format(strlen)))
+
+many = make_series('matchthis'+string.uppercase, strlen=19, size=10000) # 31% matches
+few = make_series('matchthis'+string.uppercase*42, strlen=19, size=10000) # 1% matches
+"""
+
+strings_cat = Benchmark("many.str.cat(sep=',')", setup)
+strings_title = Benchmark("many.str.title()", setup)
+strings_count = Benchmark("many.str.count('matchthis')", setup)
+strings_contains_many = Benchmark("many.str.contains('matchthis')", setup)
+strings_contains_few = Benchmark("few.str.contains('matchthis')", setup)
+strings_contains_many_noregex = Benchmark(
+    "many.str.contains('matchthis', regex=False)", setup)
+strings_contains_few_noregex = Benchmark(
+    "few.str.contains('matchthis', regex=False)", setup)
+strings_startswith = Benchmark("many.str.startswith('matchthis')", setup)
+strings_endswith = Benchmark("many.str.endswith('matchthis')", setup)
+strings_lower = Benchmark("many.str.lower()", setup)
+strings_upper = Benchmark("many.str.upper()", setup)
+strings_replace = Benchmark("many.str.replace(r'(matchthis)', r'\1\1')", setup)
+strings_repeat = Benchmark(
+    "many.str.repeat(list(IT.islice(IT.cycle(range(1,4)),len(many))))", setup)
+strings_match = Benchmark("many.str.match(r'mat..this')", setup)
+strings_extract = Benchmark("many.str.extract(r'(\w*)matchthis(\w*)')", setup)
+strings_join_split = Benchmark("many.str.join(r'--').str.split('--')", setup)
+strings_len = Benchmark("many.str.len()", setup)
+strings_findall = Benchmark("many.str.findall(r'[A-Z]+')", setup)
+strings_pad = Benchmark("many.str.pad(100, side='both')", setup)
+strings_center = Benchmark("many.str.center(100)", setup)
+strings_slice = Benchmark("many.str.slice(5,15,2)", setup)
+strings_strip = Benchmark("many.str.strip('matchthis')", setup)
+strings_lstrip = Benchmark("many.str.lstrip('matchthis')", setup)
+strings_rstrip = Benchmark("many.str.rstrip('matchthis')", setup)
+strings_get = Benchmark("many.str.get(0)", setup)
+
+setup = common_setup + """
+import pandas.util.testing as testing
+ser = pd.Series(testing.makeUnicodeIndex())
+"""
+
+strings_encode_decode = Benchmark("ser.str.encode('utf-8').str.decode('utf-8')", setup)
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
@@ -22,6 +22,7 @@
            'reindex',
            'replace',
            'sparse',
+           'strings',           
            'reshape',
            'stat_ops',
            'timeseries',