From efe57e4f8eff1973018eb765c4e599cdabe3f23d Mon Sep 17 00:00:00 2001
From: Chris Whelan <topherwhelan@gmail.com>
Date: Wed, 24 Dec 2014 01:46:02 -0800
Subject: [PATCH] PERF: Utilize mixed dtypes in df.count() with MultiIndexes

---
 doc/source/whatsnew/v0.16.0.txt |  1 +
 pandas/core/frame.py            | 29 ++++++++++++++++++++---------
 vb_suite/frame_methods.py       | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index 9a3a6bf319810..9ffcfb70c29db 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -68,6 +68,7 @@ Performance
 - Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
 - Improved the speed of `nunique` by calling `unique` instead of `value_counts` (:issue:`9129`, :issue:`7771`)
 - Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`)
+- Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`)
 
 Bug Fixes
 ~~~~~~~~~
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c30a3035de4cb..8ee65949e6bc1 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4108,27 +4108,38 @@ def _count_level(self, level, axis=0, numeric_only=False):
         else:
             frame = self
 
-        if axis == 1:
-            frame = frame.T
+        count_axis = frame._get_axis(axis)
+        agg_axis = frame._get_agg_axis(axis)
 
-        if not isinstance(frame.index, MultiIndex):
+        if not isinstance(count_axis, MultiIndex):
             raise TypeError("Can only count levels on hierarchical %s." %
                             self._get_axis_name(axis))
 
-        # python 2.5
-        mask = notnull(frame.values).view(np.uint8)
+        if frame._is_mixed_type:
+            # Since we have mixed types, calling notnull(frame.values) might
+            # upcast everything to object
+            mask = notnull(frame).values
+        else:
+            # But use the speedup when we have homogeneous dtypes
+            mask = notnull(frame.values)
+
+        if axis == 1:
+            # We're transposing the mask rather than frame to avoid potential
+            # upcasts to object, which induces a ~20x slowdown
+            mask = mask.T
 
         if isinstance(level, compat.string_types):
-            level = self.index._get_level_number(level)
+            level = count_axis._get_level_number(level)
 
-        level_index = frame.index.levels[level]
-        labels = com._ensure_int64(frame.index.labels[level])
+        level_index = count_axis.levels[level]
+        labels = com._ensure_int64(count_axis.labels[level])
         counts = lib.count_level_2d(mask, labels, len(level_index))
 
         result = DataFrame(counts, index=level_index,
-                           columns=frame.columns)
+                           columns=agg_axis)
 
         if axis == 1:
+            # Undo our earlier transpose
             return result.T
         else:
             return result
diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py
index 2fe2b6d76ec5c..334534ed466f2 100644
--- a/vb_suite/frame_methods.py
+++ b/vb_suite/frame_methods.py
@@ -328,6 +328,38 @@ def f(K=100):
 frame_dropna_axis1_all_mixed_dtypes  = Benchmark('df.dropna(how="all",axis=1)', dropna_mixed_setup,
                                                  start_date=datetime(2012,1,1))
 
+## count by level with MultiIndex on both axes
+dropna_setup = common_setup + """
+data = np.random.randn(10000, 1000)
+df = DataFrame(data)
+df.ix[50:1000,20:50] = np.nan
+df.ix[2000:3000] = np.nan
+df.ix[:,60:70] = np.nan
+df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x)))
+df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x)))
+"""
+frame_count_level_axis0_multi = Benchmark('df.count(axis=0, level=1)', dropna_setup,
+                                          start_date=datetime(2012,1,1))
+
+frame_count_level_axis1_multi = Benchmark('df.count(axis=1, level=1)', dropna_setup,
+                                          start_date=datetime(2012,1,1))
+
+# count by level with MultiIndex on mixed dtypes
+dropna_mixed_setup = common_setup + """
+data = np.random.randn(10000, 1000)
+df = DataFrame(data)
+df.ix[50:1000,20:50] = np.nan
+df.ix[2000:3000] = np.nan
+df.ix[:,60:70] = np.nan
+df['foo'] = 'bar'
+df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x)))
+df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x)))
+"""
+frame_count_level_axis0_mixed_dtypes_multi  = Benchmark('df.count(axis=0, level=1)', dropna_mixed_setup,
+                                                        start_date=datetime(2012,1,1))
+
+frame_count_level_axis1_mixed_dtypes_multi  = Benchmark('df.count(axis=1, level=1)', dropna_mixed_setup,
+                                                        start_date=datetime(2012,1,1))
 
 #----------------------------------------------------------------------
 # apply