Skip to content

Commit efe57e4

Browse files
committed
PERF: Utilize mixed dtypes in df.count() with MultiIndexes
1 parent def58c9 commit efe57e4

File tree

3 files changed

+53
-9
lines changed

3 files changed

+53
-9
lines changed

doc/source/whatsnew/v0.16.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ Performance
6868
- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
6969
- Improved the speed of `nunique` by calling `unique` instead of `value_counts` (:issue:`9129`, :issue:`7771`)
7070
- Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`)
71+
- Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`)
7172

7273
Bug Fixes
7374
~~~~~~~~~

pandas/core/frame.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4108,27 +4108,38 @@ def _count_level(self, level, axis=0, numeric_only=False):
41084108
else:
41094109
frame = self
41104110

4111-
if axis == 1:
4112-
frame = frame.T
4111+
count_axis = frame._get_axis(axis)
4112+
agg_axis = frame._get_agg_axis(axis)
41134113

4114-
if not isinstance(frame.index, MultiIndex):
4114+
if not isinstance(count_axis, MultiIndex):
41154115
raise TypeError("Can only count levels on hierarchical %s." %
41164116
self._get_axis_name(axis))
41174117

4118-
# python 2.5
4119-
mask = notnull(frame.values).view(np.uint8)
4118+
if frame._is_mixed_type:
4119+
# Since we have mixed types, calling notnull(frame.values) might
4120+
# upcast everything to object
4121+
mask = notnull(frame).values
4122+
else:
4123+
# But use the speedup when we have homogeneous dtypes
4124+
mask = notnull(frame.values)
4125+
4126+
if axis == 1:
4127+
# We're transposing the mask rather than frame to avoid potential
4128+
# upcasts to object, which induces a ~20x slowdown
4129+
mask = mask.T
41204130

41214131
if isinstance(level, compat.string_types):
4122-
level = self.index._get_level_number(level)
4132+
level = count_axis._get_level_number(level)
41234133

4124-
level_index = frame.index.levels[level]
4125-
labels = com._ensure_int64(frame.index.labels[level])
4134+
level_index = count_axis.levels[level]
4135+
labels = com._ensure_int64(count_axis.labels[level])
41264136
counts = lib.count_level_2d(mask, labels, len(level_index))
41274137

41284138
result = DataFrame(counts, index=level_index,
4129-
columns=frame.columns)
4139+
columns=agg_axis)
41304140

41314141
if axis == 1:
4142+
# Undo our earlier transpose
41324143
return result.T
41334144
else:
41344145
return result

vb_suite/frame_methods.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,38 @@ def f(K=100):
328328
frame_dropna_axis1_all_mixed_dtypes = Benchmark('df.dropna(how="all",axis=1)', dropna_mixed_setup,
329329
start_date=datetime(2012,1,1))
330330

331+
## count with level on a MultiIndex (homogeneous dtypes)
332+
dropna_setup = common_setup + """
333+
data = np.random.randn(10000, 1000)
334+
df = DataFrame(data)
335+
df.ix[50:1000,20:50] = np.nan
336+
df.ix[2000:3000] = np.nan
337+
df.ix[:,60:70] = np.nan
338+
df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x)))
339+
df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x)))
340+
"""
341+
frame_count_level_axis0_multi = Benchmark('df.count(axis=0, level=1)', dropna_setup,
342+
start_date=datetime(2012,1,1))
343+
344+
frame_count_level_axis1_multi = Benchmark('df.count(axis=1, level=1)', dropna_setup,
345+
start_date=datetime(2012,1,1))
346+
347+
# count with level on a MultiIndex (mixed dtypes)
348+
dropna_mixed_setup = common_setup + """
349+
data = np.random.randn(10000, 1000)
350+
df = DataFrame(data)
351+
df.ix[50:1000,20:50] = np.nan
352+
df.ix[2000:3000] = np.nan
353+
df.ix[:,60:70] = np.nan
354+
df['foo'] = 'bar'
355+
df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x)))
356+
df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x)))
357+
"""
358+
frame_count_level_axis0_mixed_dtypes_multi = Benchmark('df.count(axis=0, level=1)', dropna_mixed_setup,
359+
start_date=datetime(2012,1,1))
360+
361+
frame_count_level_axis1_mixed_dtypes_multi = Benchmark('df.count(axis=1, level=1)', dropna_mixed_setup,
362+
start_date=datetime(2012,1,1))
331363

332364
#----------------------------------------------------------------------
333365
# apply

0 commit comments

Comments
 (0)