CLN: str.cat internals

h-vetinari · h-vetinari · commit fd56bb1f2025 · 2018-09-15T18:57:21.000+02:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -544,6 +544,7 @@ Other API Changes
 - :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`)
 - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
 - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
+- :meth:`Series.str.cat` now also works for binary data in Python 3 (:issue:`22721`) and raises a clearer error message when non-string columns are passed (:issue:`22722`)
 - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
 
 .. _whatsnew_0240.deprecations:
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -3,7 +3,7 @@
 
 from pandas.compat import zip
 from pandas.core.dtypes.generic import ABCSeries, ABCIndex
-from pandas.core.dtypes.missing import isna, notna
+from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_categorical_dtype,
@@ -36,114 +36,30 @@
 _shared_docs = dict()
 
 
-def _get_array_list(arr, others):
-    """
-    Auxiliary function for :func:`str_cat`
-
-    Parameters
-    ----------
-    arr : ndarray
-        The left-most ndarray of the concatenation
-    others : list, ndarray, Series
-        The rest of the content to concatenate. If list of list-likes,
-        all elements must be passable to ``np.asarray``.
-
-    Returns
-    -------
-    list
-        List of all necessary arrays
-    """
-    from pandas.core.series import Series
-
-    if len(others) and isinstance(com.values_from_object(others)[0],
-                                  (list, np.ndarray, Series)):
-        arrays = [arr] + list(others)
-    else:
-        arrays = [arr, others]
-
-    return [np.asarray(x, dtype=object) for x in arrays]
-
-
-def str_cat(arr, others=None, sep=None, na_rep=None):
-    """
+def str_cat_core(array, sep):
+    '''
     Auxiliary function for :meth:`str.cat`
 
-    If `others` is specified, this function concatenates the Series/Index
-    and elements of `others` element-wise.
-    If `others` is not being passed then all values in the Series are
-    concatenated in a single string with a given `sep`.
-
     Parameters
     ----------
-    others : list-like, or list of list-likes, optional
-        List-likes (or a list of them) of the same length as calling object.
-        If None, returns str concatenating strings of the Series.
-    sep : string or None, default None
-        If None, concatenates without any separator.
-    na_rep : string or None, default None
-        If None, NA in the series are ignored.
+    array : ndarray
+        Array containing the vectors to be concatenated. These vectors must be
+        of object type and may not contain any nulls!
+    sep : string
+        The separator string for concatenating the columns
 
     Returns
     -------
-    concat
-        ndarray containing concatenated results (if `others is not None`)
-        or str (if `others is None`)
-    """
-    if sep is None:
-        sep = ''
-
-    if others is not None:
-        arrays = _get_array_list(arr, others)
-
-        n = _length_check(arrays)
-        masks = np.array([isna(x) for x in arrays])
-        cats = None
-
-        if na_rep is None:
-            na_mask = np.logical_or.reduce(masks, axis=0)
-
-            result = np.empty(n, dtype=object)
-            np.putmask(result, na_mask, np.nan)
-
-            notmask = ~na_mask
-
-            tuples = zip(*[x[notmask] for x in arrays])
-            cats = [sep.join(tup) for tup in tuples]
-
-            result[notmask] = cats
-        else:
-            for i, x in enumerate(arrays):
-                x = np.where(masks[i], na_rep, x)
-                if cats is None:
-                    cats = x
-                else:
-                    cats = cats + sep + x
-
-            result = cats
-
-        return result
+    concatenated
+        the vector of concatenated results
+    '''
+    if sep == '':
+        return array.sum(axis=1)
     else:
-        arr = np.asarray(arr, dtype=object)
-        mask = isna(arr)
-        if na_rep is None and mask.any():
-            if sep == '':
-                na_rep = ''
-            else:
-                return sep.join(arr[notna(arr)])
-        return sep.join(np.where(mask, na_rep, arr))
-
-
-def _length_check(others):
-    n = None
-    for x in others:
-        try:
-            if n is None:
-                n = len(x)
-            elif len(x) != n:
-                raise ValueError('All arrays must be same length')
-        except TypeError:
-            raise ValueError('Must pass arrays containing strings to str_cat')
-    return n
+        tmp = np.full((array.shape[0], 2 * array.shape[1] - 1),
+                      fill_value=sep, dtype='object')
+        tmp[:, ::2] = array
+        return tmp.sum(axis=1)
 
 
 def _na_map(f, arr, na_result=np.nan, dtype=object):
@@ -2172,6 +2088,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
 
         if isinstance(others, compat.string_types):
             raise ValueError("Did you mean to supply a `sep` keyword?")
+        if sep is None:
+            sep = ''
 
         if isinstance(self._orig, Index):
             data = Series(self._orig, index=self._orig)
@@ -2180,9 +2098,11 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
 
         # concatenate Series/Index with itself if no "others"
         if others is None:
-            result = str_cat(data, others=others, sep=sep, na_rep=na_rep)
-            return self._wrap_result(result,
-                                     use_codes=(not self._is_categorical))
+            if na_rep is None:
+                data = data.dropna()
+            else:
+                data = data.fillna(na_rep)
+            return sep.join(data.values)
 
         try:
             # turn anything in "others" into lists of Series
@@ -2198,6 +2118,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
                                  'must all be of the same length as the '
                                  'calling Series/Index.')
 
+        if any(not is_object_dtype(x)
+               and not (is_categorical_dtype(x)
+                        and is_object_dtype(x.cat.categories))
+               for x in others):
+            raise TypeError('All columns in others must contain only strings '
+                            '(or missing values)!')
+
         if join is None and warn:
             warnings.warn("A future version of pandas will perform index "
                           "alignment when `others` is a Series/Index/"
@@ -2209,23 +2136,30 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
                           "'outer'|'inner'|'right'`. The future default will "
                           "be `join='left'`.", FutureWarning, stacklevel=2)
 
-        # align if required
-        if join is not None:
-            # Need to add keys for uniqueness in case of duplicate columns
-            others = concat(others, axis=1,
-                            join=(join if join == 'inner' else 'outer'),
-                            keys=range(len(others)))
-            data, others = data.align(others, join=join)
-            others = [others[x] for x in others]  # again list of Series
+        # if join is None, _get_series_list already aligned indexes
+        join = 'left' if join is None else join
 
-        # str_cat discards index
-        res = str_cat(data, others=others, sep=sep, na_rep=na_rep)
+        # Need to add keys for uniqueness in case of duplicate columns
+        others = concat(others, axis=1,
+                        join=(join if join == 'inner' else 'outer'),
+                        keys=range(len(others)))
+        data, others = data.align(others, join=join)
+        df = concat([data, others], axis=1).astype('object')
+
+        # calculate in numpy using str_cat_core; result is 1-dim np.ndarray
+        if na_rep is None:
+            mask = df.isna().values.any(axis=1)
+            result = np.full(len(data), fill_value=np.nan, dtype='object')
+            result[~mask] = str_cat_core(df.values[~mask], sep)
+        else:
+            df = df.fillna(na_rep)
+            result = str_cat_core(df.values, sep)
 
         if isinstance(self._orig, Index):
-            res = Index(res, name=self._orig.name)
+            result = Index(result, name=self._orig.name)
         else:  # Series
-            res = Series(res, index=data.index, name=self._orig.name)
-        return res
+            result = Series(result, index=data.index, name=self._orig.name)
+        return result
 
     _shared_docs['str_split'] = ("""
     Split strings around given separator/delimiter.
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -97,53 +97,6 @@ def test_iter_object_try_string(self):
         assert i == 100
         assert s == 'h'
 
-    def test_cat(self):
-        one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_)
-        two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_)
-
-        # single array
-        result = strings.str_cat(one)
-        exp = 'aabbc'
-        assert result == exp
-
-        result = strings.str_cat(one, na_rep='NA')
-        exp = 'aabbcNA'
-        assert result == exp
-
-        result = strings.str_cat(one, na_rep='-')
-        exp = 'aabbc-'
-        assert result == exp
-
-        result = strings.str_cat(one, sep='_', na_rep='NA')
-        exp = 'a_a_b_b_c_NA'
-        assert result == exp
-
-        result = strings.str_cat(two, sep='-')
-        exp = 'a-b-d-foo'
-        assert result == exp
-
-        # Multiple arrays
-        result = strings.str_cat(one, [two], na_rep='NA')
-        exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'],
-                       dtype=np.object_)
-        tm.assert_numpy_array_equal(result, exp)
-
-        result = strings.str_cat(one, two)
-        exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_)
-        tm.assert_almost_equal(result, exp)
-
-        # error for incorrect lengths
-        rgx = 'All arrays must be same length'
-        three = Series(['1', '2', '3'])
-
-        with tm.assert_raises_regex(ValueError, rgx):
-            strings.str_cat(one, three)
-
-        # error for incorrect type
-        rgx = "Must pass arrays containing strings to str_cat"
-        with tm.assert_raises_regex(ValueError, rgx):
-            strings.str_cat(one, 'three')
-
     @pytest.mark.parametrize('box', [Series, Index])
     @pytest.mark.parametrize('other', [None, Series, Index])
     def test_str_cat_name(self, box, other):
@@ -242,6 +195,13 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target):
             result = s.str.cat(t)
             assert_series_or_index_equal(result, expected)
 
+        # errors for inputs not consisting of strings (also when categorical)
+        rgx = 'All columns in others must contain only strings.*'
+
+        # data with non-object dtype
+        with tm.assert_raises_regex(TypeError, rgx):
+            s.str.cat(Series([1, 2, 3, 4]))
+
     @pytest.mark.parametrize('box', [Series, Index])
     def test_str_cat_mixed_inputs(self, box):
         s = Index(['a', 'b', 'c', 'd'])
@@ -352,6 +312,13 @@ def test_str_cat_mixed_inputs(self, box):
         with tm.assert_raises_regex(TypeError, rgx):
             s.str.cat(1)
 
+        # errors for inputs not consisting of strings
+        rgx = 'All columns in others must contain only strings.*'
+
+        # data with non-object dtype
+        with tm.assert_raises_regex(TypeError, rgx):
+            s.str.cat(Series([1, 2, 3, 4]))
+
     @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right'])
     @pytest.mark.parametrize('box', [Series, Index])
     def test_str_cat_align_indexed(self, box, join):
@@ -3135,10 +3102,7 @@ def test_str_accessor_no_new_attributes(self):
     def test_method_on_bytes(self):
         lhs = Series(np.array(list('abc'), 'S1').astype(object))
         rhs = Series(np.array(list('def'), 'S1').astype(object))
-        if compat.PY3:
-            pytest.raises(TypeError, lhs.str.cat, rhs)
-        else:
-            result = lhs.str.cat(rhs)
-            expected = Series(np.array(
-                ['ad', 'be', 'cf'], 'S2').astype(object))
-            tm.assert_series_equal(result, expected)
+
+        result = lhs.str.cat(rhs, join='left')
+        expected = Series(np.array(['ad', 'be', 'cf'], 'S2').astype(object))
+        tm.assert_series_equal(result, expected)