Skip to content

PERF: DataFrame.groupby.nunique #15201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit (author and branch names were not captured)
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -121,7 +121,7 @@ Other enhancements
- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)

- ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`).
- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`).
- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`).

- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
Expand Down
23 changes: 21 additions & 2 deletions pandas/core/groupby.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,7 +34,9 @@
from pandas.types.cast import _possibly_downcast_to_dtype
from pandas.types.missing import isnull, notnull, _maybe_fill

from pandas.core.common import _values_from_object, AbstractMethodError
from pandas.core.common import (_values_from_object, AbstractMethodError,
_default_index)

from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
DataError, SpecificationError)
from pandas.core.categorical import Categorical
Expand Down Expand Up @@ -4042,7 +4044,24 @@ def nunique(self, dropna=True):
4 ham 5 x
5 ham 5 y
"""
return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna))

obj = self._selected_obj

def groupby_series(obj, col=None):
return SeriesGroupBy(obj,
selection=col,
grouper=self.grouper).nunique(dropna=dropna)

if isinstance(obj, Series):
results = groupby_series(obj)
else:
from pandas.tools.merge import concat
results = [groupby_series(obj[col], col) for col in obj.columns]
results = concat(results, axis=1)

if not self.as_index:
results.index = _default_index(len(results))
return results


from pandas.tools.plotting import boxplot_frame_groupby # noqa
Expand Down
19 changes: 11 additions & 8 deletions pandas/tests/groupby/test_groupby.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import print_function
import nose

from string import ascii_lowercase
from datetime import datetime
from numpy import nan

Expand Down Expand Up @@ -1807,22 +1808,22 @@ def test_groupby_as_index_agg(self):
assert_frame_equal(left, right)

def test_series_groupby_nunique(self):
from itertools import product
from string import ascii_lowercase

def check_nunique(df, keys):
for sort, dropna in product((False, True), repeat=2):
gr = df.groupby(keys, sort=sort)
def check_nunique(df, keys, as_index=True):
for sort, dropna in cart_product((False, True), repeat=2):
gr = df.groupby(keys, as_index=as_index, sort=sort)
left = gr['julie'].nunique(dropna=dropna)

gr = df.groupby(keys, sort=sort)
gr = df.groupby(keys, as_index=as_index, sort=sort)
right = gr['julie'].apply(Series.nunique, dropna=dropna)
if not as_index:
right = right.reset_index(drop=True)

assert_series_equal(left, right)
assert_series_equal(left, right, check_names=False)

days = date_range('2015-08-23', periods=10)

for n, m in product(10 ** np.arange(2, 6), (10, 100, 1000)):
for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)):
frame = DataFrame({
'jim': np.random.choice(
list(ascii_lowercase), n),
Expand All @@ -1841,6 +1842,8 @@ def check_nunique(df, keys):

check_nunique(frame, ['jim'])
check_nunique(frame, ['jim', 'joe'])
check_nunique(frame, ['jim'], as_index=False)
check_nunique(frame, ['jim', 'joe'], as_index=False)

def test_series_groupby_value_counts(self):
from itertools import product
Expand Down