Skip to content

Commit fd56bb1

Browse files
committed
CLN: str.cat internals
1 parent 3745576 commit fd56bb1

File tree

3 files changed

+70
-171
lines changed

3 files changed

+70
-171
lines changed

doc/source/whatsnew/v0.24.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,7 @@ Other API Changes
544544
- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`)
545545
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
546546
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
547+
- :meth:`Series.str.cat` now also works for binary data in Python 3 (:issue:`22721`) and raises a clearer error message when non-string columns are passed (:issue:`22722`)
547548
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
548549

549550
.. _whatsnew_0240.deprecations:

pandas/core/strings.py

Lines changed: 51 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from pandas.compat import zip
55
from pandas.core.dtypes.generic import ABCSeries, ABCIndex
6-
from pandas.core.dtypes.missing import isna, notna
6+
from pandas.core.dtypes.missing import isna
77
from pandas.core.dtypes.common import (
88
is_bool_dtype,
99
is_categorical_dtype,
@@ -36,114 +36,30 @@
3636
_shared_docs = dict()
3737

3838

39-
def _get_array_list(arr, others):
40-
"""
41-
Auxiliary function for :func:`str_cat`
42-
43-
Parameters
44-
----------
45-
arr : ndarray
46-
The left-most ndarray of the concatenation
47-
others : list, ndarray, Series
48-
The rest of the content to concatenate. If list of list-likes,
49-
all elements must be passable to ``np.asarray``.
50-
51-
Returns
52-
-------
53-
list
54-
List of all necessary arrays
55-
"""
56-
from pandas.core.series import Series
57-
58-
if len(others) and isinstance(com.values_from_object(others)[0],
59-
(list, np.ndarray, Series)):
60-
arrays = [arr] + list(others)
61-
else:
62-
arrays = [arr, others]
63-
64-
return [np.asarray(x, dtype=object) for x in arrays]
65-
66-
67-
def str_cat(arr, others=None, sep=None, na_rep=None):
68-
"""
39+
def str_cat_core(array, sep):
40+
'''
6941
Auxiliary function for :meth:`str.cat`
7042
71-
If `others` is specified, this function concatenates the Series/Index
72-
and elements of `others` element-wise.
73-
If `others` is not being passed then all values in the Series are
74-
concatenated in a single string with a given `sep`.
75-
7643
Parameters
7744
----------
78-
others : list-like, or list of list-likes, optional
79-
List-likes (or a list of them) of the same length as calling object.
80-
If None, returns str concatenating strings of the Series.
81-
sep : string or None, default None
82-
If None, concatenates without any separator.
83-
na_rep : string or None, default None
84-
If None, NA in the series are ignored.
45+
array : ndarray
46+
Array containing the vectors to be concatenated. These vectors must be
47+
of object type and may not contain any nulls!
48+
sep : string
49+
The separator string for concatenating the columns
8550
8651
Returns
8752
-------
88-
concat
89-
ndarray containing concatenated results (if `others is not None`)
90-
or str (if `others is None`)
91-
"""
92-
if sep is None:
93-
sep = ''
94-
95-
if others is not None:
96-
arrays = _get_array_list(arr, others)
97-
98-
n = _length_check(arrays)
99-
masks = np.array([isna(x) for x in arrays])
100-
cats = None
101-
102-
if na_rep is None:
103-
na_mask = np.logical_or.reduce(masks, axis=0)
104-
105-
result = np.empty(n, dtype=object)
106-
np.putmask(result, na_mask, np.nan)
107-
108-
notmask = ~na_mask
109-
110-
tuples = zip(*[x[notmask] for x in arrays])
111-
cats = [sep.join(tup) for tup in tuples]
112-
113-
result[notmask] = cats
114-
else:
115-
for i, x in enumerate(arrays):
116-
x = np.where(masks[i], na_rep, x)
117-
if cats is None:
118-
cats = x
119-
else:
120-
cats = cats + sep + x
121-
122-
result = cats
123-
124-
return result
53+
concatenated
54+
the vector of concatenated results
55+
'''
56+
if sep == '':
57+
return array.sum(axis=1)
12558
else:
126-
arr = np.asarray(arr, dtype=object)
127-
mask = isna(arr)
128-
if na_rep is None and mask.any():
129-
if sep == '':
130-
na_rep = ''
131-
else:
132-
return sep.join(arr[notna(arr)])
133-
return sep.join(np.where(mask, na_rep, arr))
134-
135-
136-
def _length_check(others):
137-
n = None
138-
for x in others:
139-
try:
140-
if n is None:
141-
n = len(x)
142-
elif len(x) != n:
143-
raise ValueError('All arrays must be same length')
144-
except TypeError:
145-
raise ValueError('Must pass arrays containing strings to str_cat')
146-
return n
59+
tmp = np.full((array.shape[0], 2 * array.shape[1] - 1),
60+
fill_value=sep, dtype='object')
61+
tmp[:, ::2] = array
62+
return tmp.sum(axis=1)
14763

14864

14965
def _na_map(f, arr, na_result=np.nan, dtype=object):
@@ -2172,6 +2088,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
21722088

21732089
if isinstance(others, compat.string_types):
21742090
raise ValueError("Did you mean to supply a `sep` keyword?")
2091+
if sep is None:
2092+
sep = ''
21752093

21762094
if isinstance(self._orig, Index):
21772095
data = Series(self._orig, index=self._orig)
@@ -2180,9 +2098,11 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
21802098

21812099
# concatenate Series/Index with itself if no "others"
21822100
if others is None:
2183-
result = str_cat(data, others=others, sep=sep, na_rep=na_rep)
2184-
return self._wrap_result(result,
2185-
use_codes=(not self._is_categorical))
2101+
if na_rep is None:
2102+
data = data.dropna()
2103+
else:
2104+
data = data.fillna(na_rep)
2105+
return sep.join(data.values)
21862106

21872107
try:
21882108
# turn anything in "others" into lists of Series
@@ -2198,6 +2118,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
21982118
'must all be of the same length as the '
21992119
'calling Series/Index.')
22002120

2121+
if any(not is_object_dtype(x)
2122+
and not (is_categorical_dtype(x)
2123+
and is_object_dtype(x.cat.categories))
2124+
for x in others):
2125+
raise TypeError('All columns in others must contain only strings '
2126+
'(or missing values)!')
2127+
22012128
if join is None and warn:
22022129
warnings.warn("A future version of pandas will perform index "
22032130
"alignment when `others` is a Series/Index/"
@@ -2209,23 +2136,30 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
22092136
"'outer'|'inner'|'right'`. The future default will "
22102137
"be `join='left'`.", FutureWarning, stacklevel=2)
22112138

2212-
# align if required
2213-
if join is not None:
2214-
# Need to add keys for uniqueness in case of duplicate columns
2215-
others = concat(others, axis=1,
2216-
join=(join if join == 'inner' else 'outer'),
2217-
keys=range(len(others)))
2218-
data, others = data.align(others, join=join)
2219-
others = [others[x] for x in others] # again list of Series
2139+
# if join is None, _get_series_list already aligned indexes
2140+
join = 'left' if join is None else join
22202141

2221-
# str_cat discards index
2222-
res = str_cat(data, others=others, sep=sep, na_rep=na_rep)
2142+
# Need to add keys for uniqueness in case of duplicate columns
2143+
others = concat(others, axis=1,
2144+
join=(join if join == 'inner' else 'outer'),
2145+
keys=range(len(others)))
2146+
data, others = data.align(others, join=join)
2147+
df = concat([data, others], axis=1).astype('object')
2148+
2149+
# calculate in numpy using str_cat_core; result is 1-dim np.ndarray
2150+
if na_rep is None:
2151+
mask = df.isna().values.any(axis=1)
2152+
result = np.full(len(data), fill_value=np.nan, dtype='object')
2153+
result[~mask] = str_cat_core(df.values[~mask], sep)
2154+
else:
2155+
df = df.fillna(na_rep)
2156+
result = str_cat_core(df.values, sep)
22232157

22242158
if isinstance(self._orig, Index):
2225-
res = Index(res, name=self._orig.name)
2159+
result = Index(result, name=self._orig.name)
22262160
else: # Series
2227-
res = Series(res, index=data.index, name=self._orig.name)
2228-
return res
2161+
result = Series(result, index=data.index, name=self._orig.name)
2162+
return result
22292163

22302164
_shared_docs['str_split'] = ("""
22312165
Split strings around given separator/delimiter.

pandas/tests/test_strings.py

Lines changed: 18 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -97,53 +97,6 @@ def test_iter_object_try_string(self):
9797
assert i == 100
9898
assert s == 'h'
9999

100-
def test_cat(self):
101-
one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_)
102-
two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_)
103-
104-
# single array
105-
result = strings.str_cat(one)
106-
exp = 'aabbc'
107-
assert result == exp
108-
109-
result = strings.str_cat(one, na_rep='NA')
110-
exp = 'aabbcNA'
111-
assert result == exp
112-
113-
result = strings.str_cat(one, na_rep='-')
114-
exp = 'aabbc-'
115-
assert result == exp
116-
117-
result = strings.str_cat(one, sep='_', na_rep='NA')
118-
exp = 'a_a_b_b_c_NA'
119-
assert result == exp
120-
121-
result = strings.str_cat(two, sep='-')
122-
exp = 'a-b-d-foo'
123-
assert result == exp
124-
125-
# Multiple arrays
126-
result = strings.str_cat(one, [two], na_rep='NA')
127-
exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'],
128-
dtype=np.object_)
129-
tm.assert_numpy_array_equal(result, exp)
130-
131-
result = strings.str_cat(one, two)
132-
exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_)
133-
tm.assert_almost_equal(result, exp)
134-
135-
# error for incorrect lengths
136-
rgx = 'All arrays must be same length'
137-
three = Series(['1', '2', '3'])
138-
139-
with tm.assert_raises_regex(ValueError, rgx):
140-
strings.str_cat(one, three)
141-
142-
# error for incorrect type
143-
rgx = "Must pass arrays containing strings to str_cat"
144-
with tm.assert_raises_regex(ValueError, rgx):
145-
strings.str_cat(one, 'three')
146-
147100
@pytest.mark.parametrize('box', [Series, Index])
148101
@pytest.mark.parametrize('other', [None, Series, Index])
149102
def test_str_cat_name(self, box, other):
@@ -242,6 +195,13 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target):
242195
result = s.str.cat(t)
243196
assert_series_or_index_equal(result, expected)
244197

198+
# errors for inputs not consisting of strings (also when categorical)
199+
rgx = 'All columns in others must contain only strings.*'
200+
201+
# data with non-object dtype
202+
with tm.assert_raises_regex(TypeError, rgx):
203+
s.str.cat(Series([1, 2, 3, 4]))
204+
245205
@pytest.mark.parametrize('box', [Series, Index])
246206
def test_str_cat_mixed_inputs(self, box):
247207
s = Index(['a', 'b', 'c', 'd'])
@@ -352,6 +312,13 @@ def test_str_cat_mixed_inputs(self, box):
352312
with tm.assert_raises_regex(TypeError, rgx):
353313
s.str.cat(1)
354314

315+
# errors for inputs not consisting of strings
316+
rgx = 'All columns in others must contain only strings.*'
317+
318+
# data with non-object dtype
319+
with tm.assert_raises_regex(TypeError, rgx):
320+
s.str.cat(Series([1, 2, 3, 4]))
321+
355322
@pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right'])
356323
@pytest.mark.parametrize('box', [Series, Index])
357324
def test_str_cat_align_indexed(self, box, join):
@@ -3135,10 +3102,7 @@ def test_str_accessor_no_new_attributes(self):
31353102
def test_method_on_bytes(self):
31363103
lhs = Series(np.array(list('abc'), 'S1').astype(object))
31373104
rhs = Series(np.array(list('def'), 'S1').astype(object))
3138-
if compat.PY3:
3139-
pytest.raises(TypeError, lhs.str.cat, rhs)
3140-
else:
3141-
result = lhs.str.cat(rhs)
3142-
expected = Series(np.array(
3143-
['ad', 'be', 'cf'], 'S2').astype(object))
3144-
tm.assert_series_equal(result, expected)
3105+
3106+
result = lhs.str.cat(rhs, join='left')
3107+
expected = Series(np.array(['ad', 'be', 'cf'], 'S2').astype(object))
3108+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)