From 6d02616548d8590c94636c9da1224e088dd38897 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 23 Jan 2017 15:31:41 -0500 Subject: [PATCH] PERF: DataFrame.groupby.nunique closes #15197 --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/groupby.py | 23 +++++++++++++++++++++-- pandas/tests/groupby/test_groupby.py | 19 +++++++++++-------- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7e4fa44ea8ded..c738d8d9c3499 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -121,7 +121,7 @@ Other enhancements - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`). -- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`). +- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`). - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 3bbf248ece1d3..84858ef5cf8b3 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -34,7 +34,9 @@ from pandas.types.cast import _possibly_downcast_to_dtype from pandas.types.missing import isnull, notnull, _maybe_fill -from pandas.core.common import _values_from_object, AbstractMethodError +from pandas.core.common import (_values_from_object, AbstractMethodError, + _default_index) + from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) from pandas.core.categorical import Categorical @@ -4042,7 +4044,24 @@ def nunique(self, dropna=True): 4 ham 5 x 5 ham 5 y """ - return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna)) + + obj = self._selected_obj + + def groupby_series(obj, col=None): + return SeriesGroupBy(obj, + selection=col, + grouper=self.grouper).nunique(dropna=dropna) + + if isinstance(obj, Series): + results = groupby_series(obj) + else: + from pandas.tools.merge import concat + results = [groupby_series(obj[col], col) for col in obj.columns] + results = concat(results, axis=1) + + if not self.as_index: + results.index = _default_index(len(results)) + return results from pandas.tools.plotting import boxplot_frame_groupby # noqa diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7140eb5a6fd12..68689189a92b2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2,6 +2,7 @@ from __future__ import print_function import nose +from string import ascii_lowercase from datetime import datetime from numpy import nan @@ -1807,22 +1808,22 @@ def test_groupby_as_index_agg(self): assert_frame_equal(left, right) def test_series_groupby_nunique(self): - from itertools import product - from string import ascii_lowercase - def check_nunique(df, keys): - for sort, dropna in product((False, True), repeat=2): - gr = df.groupby(keys, sort=sort) + def check_nunique(df, keys, as_index=True): + for sort, dropna in cart_product((False, True), repeat=2): + gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr['julie'].nunique(dropna=dropna) - gr = df.groupby(keys, sort=sort) + gr = df.groupby(keys, as_index=as_index, sort=sort) right = gr['julie'].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) - assert_series_equal(left, right) + assert_series_equal(left, right, check_names=False) days = date_range('2015-08-23', periods=10) - for n, m in product(10 ** np.arange(2, 6), (10, 100, 1000)): + for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)): frame = DataFrame({ 'jim': np.random.choice( list(ascii_lowercase), n), @@ -1841,6 +1842,8 @@ def check_nunique(df, keys): check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) + check_nunique(frame, ['jim'], as_index=False) + check_nunique(frame, ['jim', 'joe'], as_index=False) def test_series_groupby_value_counts(self): from itertools import product