diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5a23951145cb4..4086021bc61a6 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -3,8 +3,9 @@ from pandas.compat import zip from pandas.core.dtypes.generic import ABCSeries, ABCIndex -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core.dtypes.common import ( + ensure_object, is_bool_dtype, is_categorical_dtype, is_object_dtype, @@ -36,114 +37,26 @@ _shared_docs = dict() -def _get_array_list(arr, others): - """ - Auxiliary function for :func:`str_cat` - - Parameters - ---------- - arr : ndarray - The left-most ndarray of the concatenation - others : list, ndarray, Series - The rest of the content to concatenate. If list of list-likes, - all elements must be passable to ``np.asarray``. - - Returns - ------- - list - List of all necessary arrays - """ - from pandas.core.series import Series - - if len(others) and isinstance(com.values_from_object(others)[0], - (list, np.ndarray, Series)): - arrays = [arr] + list(others) - else: - arrays = [arr, others] - - return [np.asarray(x, dtype=object) for x in arrays] - - -def str_cat(arr, others=None, sep=None, na_rep=None): +def cat_core(list_of_columns, sep): """ Auxiliary function for :meth:`str.cat` - If `others` is specified, this function concatenates the Series/Index - and elements of `others` element-wise. - If `others` is not being passed then all values in the Series are - concatenated in a single string with a given `sep`. - Parameters ---------- - others : list-like, or list of list-likes, optional - List-likes (or a list of them) of the same length as calling object. - If None, returns str concatenating strings of the Series. - sep : string or None, default None - If None, concatenates without any separator. - na_rep : string or None, default None - If None, NA in the series are ignored. + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns Returns ------- - concat - ndarray containing concatenated results (if `others is not None`) - or str (if `others is None`) + nd.array + The concatenation of list_of_columns with sep """ - if sep is None: - sep = '' - - if others is not None: - arrays = _get_array_list(arr, others) - - n = _length_check(arrays) - masks = np.array([isna(x) for x in arrays]) - cats = None - - if na_rep is None: - na_mask = np.logical_or.reduce(masks, axis=0) - - result = np.empty(n, dtype=object) - np.putmask(result, na_mask, np.nan) - - notmask = ~na_mask - - tuples = zip(*[x[notmask] for x in arrays]) - cats = [sep.join(tup) for tup in tuples] - - result[notmask] = cats - else: - for i, x in enumerate(arrays): - x = np.where(masks[i], na_rep, x) - if cats is None: - cats = x - else: - cats = cats + sep + x - - result = cats - - return result - else: - arr = np.asarray(arr, dtype=object) - mask = isna(arr) - if na_rep is None and mask.any(): - if sep == '': - na_rep = '' - else: - return sep.join(arr[notna(arr)]) - return sep.join(np.where(mask, na_rep, arr)) - - -def _length_check(others): - n = None - for x in others: - try: - if n is None: - n = len(x) - elif len(x) != n: - raise ValueError('All arrays must be same length') - except TypeError: - raise ValueError('Must pass arrays containing strings to str_cat') - return n + list_with_sep = [sep] * (2 * len(list_of_columns) - 1) + list_with_sep[::2] = list_of_columns + return np.sum(list_with_sep, axis=0) def _na_map(f, arr, na_result=np.nan, dtype=object): @@ -2283,6 +2196,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): if isinstance(others, compat.string_types): raise ValueError("Did you mean to supply a `sep` keyword?") + if sep is None: + sep = '' if isinstance(self._orig, Index): data = Series(self._orig, index=self._orig) @@ -2291,9 +2206,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # concatenate Series/Index with itself if no "others" if others is None: - result = str_cat(data, others=others, sep=sep, na_rep=na_rep) - return self._wrap_result(result, - use_codes=(not self._is_categorical)) + data = ensure_object(data) + na_mask = isna(data) + if na_rep is None and na_mask.any(): + data = data[~na_mask] + elif na_rep is not None and na_mask.any(): + data = np.where(na_mask, na_rep, data) + return sep.join(data) try: # turn anything in "others" into lists of Series @@ -2320,23 +2239,45 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): "'outer'|'inner'|'right'`. The future default will " "be `join='left'`.", FutureWarning, stacklevel=2) + # if join is None, _get_series_list already force-aligned indexes + join = 'left' if join is None else join + # align if required - if join is not None: + if any(not data.index.equals(x.index) for x in others): # Need to add keys for uniqueness in case of duplicate columns others = concat(others, axis=1, join=(join if join == 'inner' else 'outer'), - keys=range(len(others))) + keys=range(len(others)), copy=False) data, others = data.align(others, join=join) others = [others[x] for x in others] # again list of Series - # str_cat discards index - res = str_cat(data, others=others, sep=sep, na_rep=na_rep) + all_cols = [ensure_object(x) for x in [data] + others] + na_masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(na_masks, axis=0) + + if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + result[not_masked] = cat_core([x[not_masked] for x in all_cols], + sep) + elif na_rep is not None and union_mask.any(): + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [np.where(nm, na_rep, col) + for nm, col in zip(na_masks, all_cols)] + result = cat_core(all_cols, sep) + else: + # no NaNs - can just concatenate + result = cat_core(all_cols, sep) if isinstance(self._orig, Index): - res = Index(res, name=self._orig.name) + result = Index(result, name=self._orig.name) else: # Series - res = Series(res, index=data.index, name=self._orig.name) - return res + result = Series(result, index=data.index, name=self._orig.name) + return result _shared_docs['str_split'] = (""" Split strings around given separator/delimiter. diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index bd450cdcf8054..75b1bcb8b2938 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -97,53 +97,6 @@ def test_iter_object_try_string(self): assert i == 100 assert s == 'h' - def test_cat(self): - one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_) - two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_) - - # single array - result = strings.str_cat(one) - exp = 'aabbc' - assert result == exp - - result = strings.str_cat(one, na_rep='NA') - exp = 'aabbcNA' - assert result == exp - - result = strings.str_cat(one, na_rep='-') - exp = 'aabbc-' - assert result == exp - - result = strings.str_cat(one, sep='_', na_rep='NA') - exp = 'a_a_b_b_c_NA' - assert result == exp - - result = strings.str_cat(two, sep='-') - exp = 'a-b-d-foo' - assert result == exp - - # Multiple arrays - result = strings.str_cat(one, [two], na_rep='NA') - exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'], - dtype=np.object_) - tm.assert_numpy_array_equal(result, exp) - - result = strings.str_cat(one, two) - exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) - tm.assert_almost_equal(result, exp) - - # error for incorrect lengths - rgx = 'All arrays must be same length' - three = Series(['1', '2', '3']) - - with tm.assert_raises_regex(ValueError, rgx): - strings.str_cat(one, three) - - # error for incorrect type - rgx = "Must pass arrays containing strings to str_cat" - with tm.assert_raises_regex(ValueError, rgx): - strings.str_cat(one, 'three') - @pytest.mark.parametrize('box', [Series, Index]) @pytest.mark.parametrize('other', [None, Series, Index]) def test_str_cat_name(self, box, other): @@ -414,6 +367,12 @@ def test_str_cat_align_mixed_inputs(self, join): with tm.assert_raises_regex(ValueError, rgx): s.str.cat([t, z], join=join) + def test_str_cat_raises(self): + # non-strings hiding behind object dtype + s = Series([1, 2, 3, 4], dtype='object') + with tm.assert_raises_regex(TypeError, "unsupported operand type.*"): + s.str.cat(s) + def test_str_cat_special_cases(self): s = Series(['a', 'b', 'c', 'd']) t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])