From efe57e4f8eff1973018eb765c4e599cdabe3f23d Mon Sep 17 00:00:00 2001 From: Chris Whelan Date: Wed, 24 Dec 2014 01:46:02 -0800 Subject: [PATCH] PERF: Utilize mixed dtypes in df.count() with MultiIndexes --- doc/source/whatsnew/v0.16.0.txt | 1 + pandas/core/frame.py | 29 ++++++++++++++++++++--------- vb_suite/frame_methods.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 9a3a6bf319810..9ffcfb70c29db 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -68,6 +68,7 @@ Performance - Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`) - Improved the speed of `nunique` by calling `unique` instead of `value_counts` (:issue:`9129`, :issue:`7771`) - Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`) +- Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`) Bug Fixes ~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c30a3035de4cb..8ee65949e6bc1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4108,27 +4108,38 @@ def _count_level(self, level, axis=0, numeric_only=False): else: frame = self - if axis == 1: - frame = frame.T + count_axis = frame._get_axis(axis) + agg_axis = frame._get_agg_axis(axis) - if not isinstance(frame.index, MultiIndex): + if not isinstance(count_axis, MultiIndex): raise TypeError("Can only count levels on hierarchical %s." 
% self._get_axis_name(axis)) - # python 2.5 - mask = notnull(frame.values).view(np.uint8) + if frame._is_mixed_type: + # Since we have mixed types, calling notnull(frame.values) might + # upcast everything to object + mask = notnull(frame).values + else: + # But use the speedup when we have homogeneous dtypes + mask = notnull(frame.values) + + if axis == 1: + # We're transposing the mask rather than frame to avoid potential + # upcasts to object, which induces a ~20x slowdown + mask = mask.T if isinstance(level, compat.string_types): - level = self.index._get_level_number(level) + level = count_axis._get_level_number(level) - level_index = frame.index.levels[level] - labels = com._ensure_int64(frame.index.labels[level]) + level_index = count_axis.levels[level] + labels = com._ensure_int64(count_axis.labels[level]) counts = lib.count_level_2d(mask, labels, len(level_index)) result = DataFrame(counts, index=level_index, - columns=frame.columns) + columns=agg_axis) if axis == 1: + # Undo our earlier transpose return result.T else: return result diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index 2fe2b6d76ec5c..334534ed466f2 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -328,6 +328,38 @@ def f(K=100): frame_dropna_axis1_all_mixed_dtypes = Benchmark('df.dropna(how="all",axis=1)', dropna_mixed_setup, start_date=datetime(2012,1,1)) +## count level multi +dropna_setup = common_setup + """ +data = np.random.randn(10000, 1000) +df = DataFrame(data) +df.ix[50:1000,20:50] = np.nan +df.ix[2000:3000] = np.nan +df.ix[:,60:70] = np.nan +df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x))) +df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x))) +""" +frame_count_level_axis0_multi = Benchmark('df.count(axis=0, level=1)', dropna_setup, + start_date=datetime(2012,1,1)) + +frame_count_level_axis1_multi = Benchmark('df.count(axis=1, level=1)', dropna_setup, + start_date=datetime(2012,1,1)) + +# count level on mixed 
dtypes +dropna_mixed_setup = common_setup + """ +data = np.random.randn(10000, 1000) +df = DataFrame(data) +df.ix[50:1000,20:50] = np.nan +df.ix[2000:3000] = np.nan +df.ix[:,60:70] = np.nan +df['foo'] = 'bar' +df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x))) +df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x))) +""" +frame_count_level_axis0_mixed_dtypes_multi = Benchmark('df.count(axis=0, level=1)', dropna_mixed_setup, + start_date=datetime(2012,1,1)) + +frame_count_level_axis1_mixed_dtypes_multi = Benchmark('df.count(axis=1, level=1)', dropna_mixed_setup, + start_date=datetime(2012,1,1)) #---------------------------------------------------------------------- # apply