Skip to content

PERF: Utilize mixed dtypes in df.count() with MultiIndexes #9163

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ Performance
- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
- Improved the speed of ``nunique`` by calling ``unique`` instead of ``value_counts`` (:issue:`9129`, :issue:`7771`)
- Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`)
- Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`)

Bug Fixes
~~~~~~~~~
Expand Down
29 changes: 20 additions & 9 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4108,27 +4108,38 @@ def _count_level(self, level, axis=0, numeric_only=False):
else:
frame = self

if axis == 1:
frame = frame.T
count_axis = frame._get_axis(axis)
agg_axis = frame._get_agg_axis(axis)

if not isinstance(frame.index, MultiIndex):
if not isinstance(count_axis, MultiIndex):
raise TypeError("Can only count levels on hierarchical %s." %
self._get_axis_name(axis))

# python 2.5
mask = notnull(frame.values).view(np.uint8)
if frame._is_mixed_type:
# Since we have mixed types, calling notnull(frame.values) might
# upcast everything to object
mask = notnull(frame).values
else:
# But use the speedup when we have homogeneous dtypes
mask = notnull(frame.values)

if axis == 1:
# We're transposing the mask rather than frame to avoid potential
# upcasts to object, which induces a ~20x slowdown
mask = mask.T

if isinstance(level, compat.string_types):
level = self.index._get_level_number(level)
level = count_axis._get_level_number(level)

level_index = frame.index.levels[level]
labels = com._ensure_int64(frame.index.labels[level])
level_index = count_axis.levels[level]
labels = com._ensure_int64(count_axis.labels[level])
counts = lib.count_level_2d(mask, labels, len(level_index))

result = DataFrame(counts, index=level_index,
columns=frame.columns)
columns=agg_axis)

if axis == 1:
# Undo our earlier transpose
return result.T
else:
return result
Expand Down
32 changes: 32 additions & 0 deletions vb_suite/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,38 @@ def f(K=100):
frame_dropna_axis1_all_mixed_dtypes = Benchmark('df.dropna(how="all",axis=1)', dropna_mixed_setup,
start_date=datetime(2012,1,1))

# count(level=...) on MultiIndex axes, homogeneous dtypes
#
# The setup builds a 10000 x 1000 frame of random floats, blanks out several
# regions with NaN, and then replaces both the index and the columns with
# two-level MultiIndexes built from (x, x) tuples. The two benchmarks below
# time df.count(..., level=1) along each axis.
#
# NOTE(review): the setup name still says "dropna" although nothing here calls
# dropna -- presumably copied from the dropna benchmarks. Confirm that no
# other benchmark in this file depends on a different binding of this name.
dropna_setup = common_setup + """
data = np.random.randn(10000, 1000)
df = DataFrame(data)
df.ix[50:1000,20:50] = np.nan
df.ix[2000:3000] = np.nan
df.ix[:,60:70] = np.nan
df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x)))
df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x)))
"""
# Count non-null values grouped by level 1 of the row MultiIndex.
frame_count_level_axis0_multi = Benchmark('df.count(axis=0, level=1)', dropna_setup,
start_date=datetime(2012,1,1))

# Count non-null values grouped by level 1 of the column MultiIndex.
frame_count_level_axis1_multi = Benchmark('df.count(axis=1, level=1)', dropna_setup,
start_date=datetime(2012,1,1))

# count(level=...) on MultiIndex axes, mixed dtypes
#
# Same frame as the homogeneous case, plus a string column ('foo' = 'bar')
# added before the axes are converted to MultiIndexes, so the frame holds
# both float and object data.
#
# NOTE(review): `dropna_mixed_setup` is also the setup name passed to the
# dropna mixed-dtypes benchmark defined earlier in this file, so this
# assignment rebinds it. Presumably harmless if Benchmark captures the setup
# string at construction -- confirm, or give this setup its own name.
dropna_mixed_setup = common_setup + """
data = np.random.randn(10000, 1000)
df = DataFrame(data)
df.ix[50:1000,20:50] = np.nan
df.ix[2000:3000] = np.nan
df.ix[:,60:70] = np.nan
df['foo'] = 'bar'
df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x)))
df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x)))
"""
# Count non-null values grouped by level 1 of the row MultiIndex.
frame_count_level_axis0_mixed_dtypes_multi = Benchmark('df.count(axis=0, level=1)', dropna_mixed_setup,
start_date=datetime(2012,1,1))

# Count non-null values grouped by level 1 of the column MultiIndex.
frame_count_level_axis1_mixed_dtypes_multi = Benchmark('df.count(axis=1, level=1)', dropna_mixed_setup,
start_date=datetime(2012,1,1))

#----------------------------------------------------------------------
# apply
Expand Down