Skip to content

Commit 016b320

Browse files
committed
ENH: better impl for na_map in string methods. close #2602
1 parent 81bf29f commit 016b320

File tree

4 files changed

+44
-23
lines changed

4 files changed

+44
-23
lines changed

pandas/core/strings.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,18 +86,30 @@ def _length_check(others):
8686

8787
def _na_map(f, arr, na_result=np.nan):
8888
# should really _check_ for NA
89-
def g(x):
90-
try:
91-
return f(x)
92-
except (TypeError, AttributeError):
93-
return na_result
94-
return _map(g, arr)
89+
return _map(f, arr, na_mask=True, na_value=na_result)
9590

9691

97-
def _map(f, arr):
92+
def _map(f, arr, na_mask=False, na_value=np.nan):
9893
if not isinstance(arr, np.ndarray):
9994
arr = np.asarray(arr, dtype=object)
100-
return lib.map_infer(arr, f)
95+
if na_mask:
96+
mask = isnull(arr)
97+
try:
98+
result = lib.map_infer_mask(arr, f, mask.view(np.uint8))
99+
except (TypeError, AttributeError):
100+
def g(x):
101+
try:
102+
return f(x)
103+
except (TypeError, AttributeError):
104+
return na_value
105+
return _map(g, arr)
106+
if na_value is not np.nan:
107+
np.putmask(result, mask, na_value)
108+
if result.dtype == object:
109+
result = lib.maybe_convert_objects(result)
110+
return result
111+
else:
112+
return lib.map_infer(arr, f)
101113

102114

103115
def str_count(arr, pat, flags=0):

pandas/io/pytables.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def __repr__(self):
248248
# a table
249249
if _is_table_type(n):
250250
values.append(str(create_table(self, n)))
251-
251+
252252
# a group
253253
elif kind is None:
254254
values.append('unknown type')
@@ -400,7 +400,7 @@ def remove(self, key, where=None):
400400
if where is None:
401401
group = self.get_node(key)
402402
group._f_remove(recursive=True)
403-
403+
404404
# delete from the table
405405
else:
406406
if not _is_table_type(group):
@@ -962,7 +962,7 @@ def infer(self, table):
962962
def convert(self, values):
963963
""" set the values from this selection """
964964
self.values = Index(_maybe_convert(values[self.cname], self.kind))
965-
965+
966966
@property
967967
def attrs(self):
968968
return self.table._v_attrs
@@ -1362,7 +1362,7 @@ def infer_axes(self):
13621362
return a boolean indicating if we have a valid table or not """
13631363

13641364
table = self.table
1365-
if table is None:
1365+
if table is None:
13661366
return False
13671367

13681368
self.index_axes, self.values_axes = [ a.infer(self.table) for a in self.indexables if a.is_indexable ], [ a.infer(self.table) for a in self.indexables if not a.is_indexable ]
@@ -1412,13 +1412,13 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None):
14121412
indexer = len(self.non_index_axes)
14131413
exist_axis = existing_table.non_index_axes[indexer][1]
14141414
if append_axis != exist_axis:
1415-
1415+
14161416
# ahah! -> reindex
14171417
if sorted(append_axis) == sorted(exist_axis):
14181418
append_axis = exist_axis
14191419

14201420
self.non_index_axes.append((i,append_axis))
1421-
1421+
14221422
# set axis positions (based on the axes)
14231423
self.index_axes = [ index_axes_map[a].set_pos(j) for j, a in enumerate(axes) ]
14241424
j = len(self.index_axes)
@@ -1444,7 +1444,7 @@ def create_axes(self, axes, obj, validate = True, min_itemsize = None):
14441444

14451445
# a string column
14461446
if b.dtype.name == 'object':
1447-
1447+
14481448
# itemsize is the maximum length of a string (along any dimension)
14491449
itemsize = _itemsize_string_array(values)
14501450

@@ -1489,7 +1489,7 @@ def reindex(obj, axis, filt, ordered):
14891489
ordd = ordered & filt
14901490
ordd = sorted(ordered.get_indexer(ordd))
14911491
return obj.reindex_axis(ordered.take(ordd), axis = obj._get_axis_number(axis_name), copy = False)
1492-
1492+
14931493
# apply the selection filters (but keep in the same order)
14941494
if self.selection.filter:
14951495
for axis, filt in self.selection.filter:
@@ -1558,7 +1558,7 @@ class LegacyTable(Table):
15581558
15591559
"""
15601560
_indexables = [IndexCol(name = 'index', axis = 1, pos = 0),
1561-
IndexCol(name = 'column', axis = 2, pos = 1, index_kind = 'columns_kind'),
1561+
IndexCol(name = 'column', axis = 2, pos = 1, index_kind = 'columns_kind'),
15621562
DataCol( name = 'fields', cname = 'values', kind_attr = 'fields', pos = 2) ]
15631563
table_type = 'legacy'
15641564
ndim = 3
@@ -1569,7 +1569,7 @@ def write(self, **kwargs):
15691569
def read(self, where=None):
15701570
""" we have n indexable columns, with an arbitrary number of data axes """
15711571

1572-
1572+
15731573
if not self.read_axes(where): return None
15741574

15751575
factors = [ Categorical.from_array(a.values) for a in self.index_axes ]
@@ -1591,7 +1591,7 @@ def read(self, where=None):
15911591

15921592
# the data need to be sorted
15931593
sorted_values = c.take_data().take(sorter, axis=0)
1594-
1594+
15951595
take_labels = [ l.take(sorter) for l in labels ]
15961596
items = Index(c.values)
15971597
block = block2d_to_blocknd(sorted_values, items, tuple(N), take_labels)
@@ -1767,7 +1767,7 @@ def delete(self, where = None):
17671767
# final element
17681768
if groups[-1] != ln:
17691769
groups.append(ln)
1770-
1770+
17711771
# initial element
17721772
if groups[0] != 0:
17731773
groups.insert(0,0)
@@ -1893,7 +1893,7 @@ def create_table(parent, group, typ = None, **kwargs):
18931893

18941894
def _itemsize_string_array(arr):
18951895
""" return the maximum size of elements in a string array """
1896-
return max([ str_len(arr[v]).max() for v in range(arr.shape[0]) ])
1896+
return max([ str_len(arr[v].ravel()).max() for v in range(arr.shape[0]) ])
18971897

18981898
def _convert_index(index):
18991899
if isinstance(index, DatetimeIndex):
@@ -2289,4 +2289,3 @@ def f(values, freq=None, tz=None):
22892289
tz=tz)
22902290
return f
22912291
return klass
2292-

pandas/tests/test_strings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def test_count(self):
6464
self.assert_(isinstance(result, Series))
6565
tm.assert_almost_equal(result, exp)
6666

67-
#mixed
67+
# mixed
6868
mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
6969
rs = strings.str_count(mixed, 'a')
7070
xp = [1, NA, 0, NA, NA, 0, NA, NA, NA]

vb_suite/frame_methods.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,13 @@ def f(K=500):
113113

114114
frame_insert_500_columns = Benchmark('f()', setup,
115115
start_date=datetime(2011, 1, 1))
116+
117+
#----------------------------------------------------------------------
118+
# strings methods, #2602
119+
120+
setup = common_setup + """
121+
s = Series(['abcdefg', np.nan]*500000)
122+
"""
123+
124+
series_string_vector_slice = Benchmark('s.str[:5]', setup,
125+
start_date=datetime(2012, 8, 1))

0 commit comments

Comments
 (0)