BUG StringMethods on empty series (GH7242)

wabu · hayd · commit f24f2e81e702 · 2014-06-03T20:31:58.000-07:00
- all StringMethods are tested and work on empty series
- moreover extract always returns dtype==object, even when no match is
  found
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1165,6 +1165,9 @@ Thus, a Series of messy strings can be "converted" into a
 like-indexed Series or DataFrame of cleaned-up or more useful strings,
 without necessitating ``get()`` to access tuples or ``re.match`` objects.
 
+The dtype of the result is always object, even if no match is found and the
+result only contains ``NaN``.
+
 Named groups like
 
 .. ipython:: python
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
@@ -30,6 +30,10 @@ API changes
 - Openpyxl now raises a ValueError on construction of the openpyxl writer
   instead of warning on pandas import (:issue:`7284`).
 
+- For ``StringMethods.extract``, when no match is found, the result - only
+  containing ``NaN`` values - now also has ``dtype=object`` instead of
+  ``float`` (:issue:`7242`)
+
 .. _whatsnew_0141.prior_deprecations:
 
 Prior Version Deprecations/Changes
@@ -90,3 +94,4 @@ Bug Fixes
 - Bug in broadcasting with ``.div``, integer dtypes and divide-by-zero (:issue:`7325`)
 - Bug in ``CustomBusinessDay.apply`` raiases ``NameError`` when ``np.datetime64`` object is passed (:issue:`7196`)
 - Bug in ``MultiIndex.append``, ``concat`` and ``pivot_table`` don't preserve timezone (:issue:`6606`)
+- Bug in ``StringMethods`` on empty Series; all methods now work on empty Series (:issue:`7242`)
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -12,7 +12,7 @@
 
 
 def _get_array_list(arr, others):
-    if isinstance(others[0], (list, np.ndarray)):
+    if len(others) and isinstance(others[0], (list, np.ndarray)):
         arrays = [arr] + list(others)
     else:
         arrays = [arr, others]
@@ -88,12 +88,15 @@ def _length_check(others):
     return n
 
 
-def _na_map(f, arr, na_result=np.nan):
+def _na_map(f, arr, na_result=np.nan, dtype=object):
     # should really _check_ for NA
-    return _map(f, arr, na_mask=True, na_value=na_result)
+    return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
 
 
-def _map(f, arr, na_mask=False, na_value=np.nan):
+def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
+    if not len(arr):
+        return np.ndarray(0, dtype=dtype)
+
     if isinstance(arr, Series):
         arr = arr.values
     if not isinstance(arr, np.ndarray):
@@ -108,7 +111,7 @@ def g(x):
                     return f(x)
                 except (TypeError, AttributeError):
                     return na_value
-            return _map(g, arr)
+            return _map(g, arr, dtype=dtype)
         if na_value is not np.nan:
             np.putmask(result, mask, na_value)
             if result.dtype == object:
@@ -146,7 +149,7 @@ def str_count(arr, pat, flags=0):
     """
     regex = re.compile(pat, flags=flags)
     f = lambda x: len(regex.findall(x))
-    return _na_map(f, arr)
+    return _na_map(f, arr, dtype=int)
 
 
 def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
@@ -187,7 +190,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
         f = lambda x: bool(regex.search(x))
     else:
         f = lambda x: pat in x
-    return _na_map(f, arr, na)
+    return _na_map(f, arr, na, dtype=bool)
 
 
 def str_startswith(arr, pat, na=np.nan):
@@ -206,7 +209,7 @@ def str_startswith(arr, pat, na=np.nan):
     startswith : array (boolean)
     """
     f = lambda x: x.startswith(pat)
-    return _na_map(f, arr, na)
+    return _na_map(f, arr, na, dtype=bool)
 
 
 def str_endswith(arr, pat, na=np.nan):
@@ -225,7 +228,7 @@ def str_endswith(arr, pat, na=np.nan):
     endswith : array (boolean)
     """
     f = lambda x: x.endswith(pat)
-    return _na_map(f, arr, na)
+    return _na_map(f, arr, na, dtype=bool)
 
 
 def str_lower(arr):
@@ -375,6 +378,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
     # and is basically useless, so we will not warn.
 
     if (not as_indexer) and regex.groups > 0:
+        dtype = object
         def f(x):
             m = regex.match(x)
             if m:
@@ -383,9 +387,10 @@ def f(x):
                 return []
     else:
         # This is the new behavior of str_match.
+        dtype = bool
         f = lambda x: bool(regex.match(x))
 
-    return _na_map(f, arr, na)
+    return _na_map(f, arr, na, dtype=dtype)
 
 
 def _get_single_group_name(rx):
@@ -409,6 +414,9 @@ def str_extract(arr, pat, flags=0):
     Returns
     -------
     extracted groups : Series (one group) or DataFrame (multiple groups)
+        Note that dtype of the result is always object, even when no match is
+        found and the result is a Series or DataFrame containing only NaN
+        values.
 
     Examples
     --------
@@ -461,13 +469,17 @@ def f(x):
     if regex.groups == 1:
         result = Series([f(val)[0] for val in arr],
                         name=_get_single_group_name(regex),
-                        index=arr.index)
+                        index=arr.index, dtype=object)
     else:
         names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
         columns = [names.get(1 + i, i) for i in range(regex.groups)]
-        result = DataFrame([f(val) for val in arr],
-                           columns=columns,
-                           index=arr.index)
+        if arr.empty:
+            result = DataFrame(columns=columns, dtype=object)
+        else:
+            result = DataFrame([f(val) for val in arr],
+                               columns=columns,
+                               index=arr.index,
+                               dtype=object)
     return result
 
 
@@ -536,7 +548,7 @@ def str_len(arr):
     -------
     lengths : array
     """
-    return _na_map(len, arr)
+    return _na_map(len, arr, dtype=int)
 
 
 def str_findall(arr, pat, flags=0):
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -505,12 +505,12 @@ def test_extract(self):
 
         # one group, no matches
         result = s.str.extract('(_)')
-        exp = Series([NA, NA, NA])
+        exp = Series([NA, NA, NA], dtype=object)
         tm.assert_series_equal(result, exp)
 
         # two groups, no matches
         result = s.str.extract('(_)(_)')
-        exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]])
+        exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object)
         tm.assert_frame_equal(result, exp)
 
         # one group, some matches
@@ -585,6 +585,47 @@ def test_extract_single_series_name_is_preserved(self):
         tm.assert_series_equal(r, e)
         self.assertEqual(r.name, e.name)
 
+    def test_empty_str_methods(self):
+        empty_str = empty = Series(dtype=str)
+        empty_int = Series(dtype=int)
+        empty_bool = Series(dtype=bool)
+        empty_list = Series(dtype=list)
+        empty_bytes = Series(dtype=object)
+
+        # GH7241
+        # (extract) on empty series
+
+        tm.assert_series_equal(empty_str, empty.str.cat(empty))
+        tm.assert_equal('', empty.str.cat())
+        tm.assert_series_equal(empty_str, empty.str.title())
+        tm.assert_series_equal(empty_int, empty.str.count('a'))
+        tm.assert_series_equal(empty_bool, empty.str.contains('a'))
+        tm.assert_series_equal(empty_bool, empty.str.startswith('a'))
+        tm.assert_series_equal(empty_bool, empty.str.endswith('a'))
+        tm.assert_series_equal(empty_str, empty.str.lower())
+        tm.assert_series_equal(empty_str, empty.str.upper())
+        tm.assert_series_equal(empty_str, empty.str.replace('a','b'))
+        tm.assert_series_equal(empty_str, empty.str.repeat(3))
+        tm.assert_series_equal(empty_bool, empty.str.match('^a'))
+        tm.assert_series_equal(empty_str, empty.str.extract('()'))
+        tm.assert_frame_equal(DataFrame(columns=[0,1], dtype=str), empty.str.extract('()()'))
+        tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies())
+        tm.assert_series_equal(empty_str, empty_list.str.join(''))
+        tm.assert_series_equal(empty_int, empty.str.len())
+        tm.assert_series_equal(empty_list, empty_list.str.findall('a'))
+        tm.assert_series_equal(empty_str, empty.str.pad(42))
+        tm.assert_series_equal(empty_str, empty.str.center(42))
+        tm.assert_series_equal(empty_list, empty.str.split('a'))
+        tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
+        tm.assert_series_equal(empty_str, empty.str.strip())
+        tm.assert_series_equal(empty_str, empty.str.lstrip())
+        tm.assert_series_equal(empty_str, empty.str.rstrip())
+        tm.assert_series_equal(empty_str, empty.str.rstrip())
+        tm.assert_series_equal(empty_str, empty.str.wrap(42))
+        tm.assert_series_equal(empty_str, empty.str.get(0))
+        tm.assert_series_equal(empty_str, empty_bytes.str.decode('ascii'))
+        tm.assert_series_equal(empty_bytes, empty.str.encode('ascii'))
+
     def test_get_dummies(self):
         s = Series(['a|b', 'a|c', np.nan])
         result = s.str.get_dummies('|')