3
3
4
4
from pandas .compat import zip
5
5
from pandas .core .dtypes .generic import ABCSeries , ABCIndex
6
- from pandas .core .dtypes .missing import isna , notna
6
+ from pandas .core .dtypes .missing import isna
7
7
from pandas .core .dtypes .common import (
8
8
is_bool_dtype ,
9
9
is_categorical_dtype ,
36
36
_shared_docs = dict ()
37
37
38
38
39
- def _get_array_list (arr , others ):
40
- """
41
- Auxiliary function for :func:`str_cat`
42
-
43
- Parameters
44
- ----------
45
- arr : ndarray
46
- The left-most ndarray of the concatenation
47
- others : list, ndarray, Series
48
- The rest of the content to concatenate. If list of list-likes,
49
- all elements must be passable to ``np.asarray``.
50
-
51
- Returns
52
- -------
53
- list
54
- List of all necessary arrays
55
- """
56
- from pandas .core .series import Series
57
-
58
- if len (others ) and isinstance (com .values_from_object (others )[0 ],
59
- (list , np .ndarray , Series )):
60
- arrays = [arr ] + list (others )
61
- else :
62
- arrays = [arr , others ]
63
-
64
- return [np .asarray (x , dtype = object ) for x in arrays ]
65
-
66
-
67
- def str_cat (arr , others = None , sep = None , na_rep = None ):
68
- """
39
+ def str_cat_core (array , sep ):
40
+ '''
69
41
Auxiliary function for :meth:`str.cat`
70
42
71
- If `others` is specified, this function concatenates the Series/Index
72
- and elements of `others` element-wise.
73
- If `others` is not being passed then all values in the Series are
74
- concatenated in a single string with a given `sep`.
75
-
76
43
Parameters
77
44
----------
78
- others : list-like, or list of list-likes, optional
79
- List-likes (or a list of them) of the same length as calling object.
80
- If None, returns str concatenating strings of the Series.
81
- sep : string or None, default None
82
- If None, concatenates without any separator.
83
- na_rep : string or None, default None
84
- If None, NA in the series are ignored.
45
+ array : ndarray
46
+ 2-dim array with one column per vector to be concatenated. It must be
47
+ of object dtype and must not contain any nulls!
48
+ sep : string
49
+ The separator string for concatenating the columns
85
50
86
51
Returns
87
52
-------
88
- concat
89
- ndarray containing concatenated results (if `others is not None`)
90
- or str (if `others is None`)
91
- """
92
- if sep is None :
93
- sep = ''
94
-
95
- if others is not None :
96
- arrays = _get_array_list (arr , others )
97
-
98
- n = _length_check (arrays )
99
- masks = np .array ([isna (x ) for x in arrays ])
100
- cats = None
101
-
102
- if na_rep is None :
103
- na_mask = np .logical_or .reduce (masks , axis = 0 )
104
-
105
- result = np .empty (n , dtype = object )
106
- np .putmask (result , na_mask , np .nan )
107
-
108
- notmask = ~ na_mask
109
-
110
- tuples = zip (* [x [notmask ] for x in arrays ])
111
- cats = [sep .join (tup ) for tup in tuples ]
112
-
113
- result [notmask ] = cats
114
- else :
115
- for i , x in enumerate (arrays ):
116
- x = np .where (masks [i ], na_rep , x )
117
- if cats is None :
118
- cats = x
119
- else :
120
- cats = cats + sep + x
121
-
122
- result = cats
123
-
124
- return result
53
+ concatenated : ndarray
54
+ 1-dim array holding, for each row, its values joined by `sep`
55
+ '''
56
+ if sep == '' :
57
+ return array .sum (axis = 1 )
125
58
else :
126
- arr = np .asarray (arr , dtype = object )
127
- mask = isna (arr )
128
- if na_rep is None and mask .any ():
129
- if sep == '' :
130
- na_rep = ''
131
- else :
132
- return sep .join (arr [notna (arr )])
133
- return sep .join (np .where (mask , na_rep , arr ))
134
-
135
-
136
- def _length_check (others ):
137
- n = None
138
- for x in others :
139
- try :
140
- if n is None :
141
- n = len (x )
142
- elif len (x ) != n :
143
- raise ValueError ('All arrays must be same length' )
144
- except TypeError :
145
- raise ValueError ('Must pass arrays containing strings to str_cat' )
146
- return n
59
+ tmp = np .full ((array .shape [0 ], 2 * array .shape [1 ] - 1 ),
60
+ fill_value = sep , dtype = 'object' )
61
+ tmp [:, ::2 ] = array
62
+ return tmp .sum (axis = 1 )
147
63
148
64
149
65
def _na_map (f , arr , na_result = np .nan , dtype = object ):
@@ -2172,6 +2088,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
2172
2088
2173
2089
if isinstance (others , compat .string_types ):
2174
2090
raise ValueError ("Did you mean to supply a `sep` keyword?" )
2091
+ if sep is None :
2092
+ sep = ''
2175
2093
2176
2094
if isinstance (self ._orig , Index ):
2177
2095
data = Series (self ._orig , index = self ._orig )
@@ -2180,9 +2098,11 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
2180
2098
2181
2099
# concatenate Series/Index with itself if no "others"
2182
2100
if others is None :
2183
- result = str_cat (data , others = others , sep = sep , na_rep = na_rep )
2184
- return self ._wrap_result (result ,
2185
- use_codes = (not self ._is_categorical ))
2101
+ if na_rep is None :
2102
+ data = data .dropna ()
2103
+ else :
2104
+ data = data .fillna (na_rep )
2105
+ return sep .join (data .values )
2186
2106
2187
2107
try :
2188
2108
# turn anything in "others" into lists of Series
@@ -2198,6 +2118,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
2198
2118
'must all be of the same length as the '
2199
2119
'calling Series/Index.' )
2200
2120
2121
+ if any (not is_object_dtype (x )
2122
+ and not (is_categorical_dtype (x )
2123
+ and is_object_dtype (x .cat .categories ))
2124
+ for x in others ):
2125
+ raise TypeError ('All columns in others must contain only strings '
2126
+ '(or missing values)!' )
2127
+
2201
2128
if join is None and warn :
2202
2129
warnings .warn ("A future version of pandas will perform index "
2203
2130
"alignment when `others` is a Series/Index/"
@@ -2209,23 +2136,30 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
2209
2136
"'outer'|'inner'|'right'`. The future default will "
2210
2137
"be `join='left'`." , FutureWarning , stacklevel = 2 )
2211
2138
2212
- # align if required
2213
- if join is not None :
2214
- # Need to add keys for uniqueness in case of duplicate columns
2215
- others = concat (others , axis = 1 ,
2216
- join = (join if join == 'inner' else 'outer' ),
2217
- keys = range (len (others )))
2218
- data , others = data .align (others , join = join )
2219
- others = [others [x ] for x in others ] # again list of Series
2139
+ # if join is None, _get_series_list already aligned indexes
2140
+ join = 'left' if join is None else join
2220
2141
2221
- # str_cat discards index
2222
- res = str_cat (data , others = others , sep = sep , na_rep = na_rep )
2142
+ # Need to add keys for uniqueness in case of duplicate columns
2143
+ others = concat (others , axis = 1 ,
2144
+ join = (join if join == 'inner' else 'outer' ),
2145
+ keys = range (len (others )))
2146
+ data , others = data .align (others , join = join )
2147
+ df = concat ([data , others ], axis = 1 ).astype ('object' )
2148
+
2149
+ # calculate in numpy using str_cat_core; result is 1-dim np.ndarray
2150
+ if na_rep is None :
2151
+ mask = df .isna ().values .any (axis = 1 )
2152
+ result = np .full (len (data ), fill_value = np .nan , dtype = 'object' )
2153
+ result [~ mask ] = str_cat_core (df .values [~ mask ], sep )
2154
+ else :
2155
+ df = df .fillna (na_rep )
2156
+ result = str_cat_core (df .values , sep )
2223
2157
2224
2158
if isinstance (self ._orig , Index ):
2225
- res = Index (res , name = self ._orig .name )
2159
+ result = Index (result , name = self ._orig .name )
2226
2160
else : # Series
2227
- res = Series (res , index = data .index , name = self ._orig .name )
2228
- return res
2161
+ result = Series (result , index = data .index , name = self ._orig .name )
2162
+ return result
2229
2163
2230
2164
_shared_docs ['str_split' ] = ("""
2231
2165
Split strings around given separator/delimiter.
0 commit comments