diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index d7de5a7ac5979..544ab27157a17 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -48,6 +48,7 @@ Performance - Fixed a performance regression for ``.loc`` indexing with an array or list-like (:issue:`9126`:). - Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`) - Improved the speed of `nunique` by calling `unique` instead of `value_counts` (:issue:`9129`, :issue:`7771`) +- Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`) Bug Fixes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7c7872cf7b6a5..c30a3035de4cb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4094,11 +4094,11 @@ def count(self, axis=0, level=None, numeric_only=False): if len(frame._get_axis(axis)) == 0: result = Series(0, index=frame._get_agg_axis(axis)) else: - if axis == 1: - counts = notnull(frame.values).sum(1) - result = Series(counts, index=frame._get_agg_axis(axis)) - else: + if frame._is_mixed_type: result = notnull(frame).sum(axis=axis) + else: + counts = notnull(frame.values).sum(axis=axis) + result = Series(counts, index=frame._get_agg_axis(axis)) return result.astype('int64') diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index 12ba042487ebe..2fe2b6d76ec5c 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -290,30 +290,43 @@ def f(K=100): start_date=datetime(2012,1,1)) ## dropna -setup = common_setup + """ +dropna_setup = common_setup + """ data = np.random.randn(10000, 1000) df = DataFrame(data) df.ix[50:1000,20:50] = np.nan df.ix[2000:3000] = np.nan df.ix[:,60:70] = np.nan """ -frame_dropna_axis0_any = Benchmark('df.dropna(how="any",axis=0)', setup, +frame_dropna_axis0_any = Benchmark('df.dropna(how="any",axis=0)', dropna_setup, start_date=datetime(2012,1,1)) -frame_dropna_axis0_all = Benchmark('df.dropna(how="all",axis=0)', setup, +frame_dropna_axis0_all = Benchmark('df.dropna(how="all",axis=0)', dropna_setup, start_date=datetime(2012,1,1)) -setup = common_setup + """ +frame_dropna_axis1_any = Benchmark('df.dropna(how="any",axis=1)', dropna_setup, + start_date=datetime(2012,1,1)) + +frame_dropna_axis1_all = Benchmark('df.dropna(how="all",axis=1)', dropna_setup, + start_date=datetime(2012,1,1)) + +# dropna on mixed dtypes +dropna_mixed_setup = common_setup + """ data = np.random.randn(10000, 1000) df = DataFrame(data) df.ix[50:1000,20:50] = np.nan df.ix[2000:3000] = np.nan df.ix[:,60:70] = np.nan +df['foo'] = 'bar' """ -frame_dropna_axis1_any = Benchmark('df.dropna(how="any",axis=1)', setup, - start_date=datetime(2012,1,1)) +frame_dropna_axis0_any_mixed_dtypes = Benchmark('df.dropna(how="any",axis=0)', dropna_mixed_setup, + start_date=datetime(2012,1,1)) +frame_dropna_axis0_all_mixed_dtypes = Benchmark('df.dropna(how="all",axis=0)', dropna_mixed_setup, + start_date=datetime(2012,1,1)) -frame_dropna_axis1_all = Benchmark('df.dropna(how="all",axis=1)', setup, - start_date=datetime(2012,1,1)) +frame_dropna_axis1_any_mixed_dtypes = Benchmark('df.dropna(how="any",axis=1)', dropna_mixed_setup, + start_date=datetime(2012,1,1)) + +frame_dropna_axis1_all_mixed_dtypes = Benchmark('df.dropna(how="all",axis=1)', dropna_mixed_setup, + start_date=datetime(2012,1,1)) #----------------------------------------------------------------------