Commit 091c73d

PERF: improves SeriesGroupBy.nunique performance
1 parent 54f02df commit 091c73d

File tree: 3 files changed (+63 −3 lines)

doc/source/whatsnew/v0.17.0.txt

Lines changed: 1 addition & 0 deletions
@@ -679,6 +679,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
 - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
+- Performance improvements in ``SeriesGroupBy.nunique`` (:issue:`10820`)
 
 - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
 - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
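
For context, a short usage sketch of the operation this whatsnew entry refers to: counting distinct values per group on a single column. The DataFrame and column names below are illustrative only and are not part of the commit.

    import numpy as np
    import pandas as pd

    # hypothetical data; 'key' and 'val' are made-up names
    df = pd.DataFrame({'key': np.random.choice(list('abc'), 100000),
                       'val': np.random.randint(0, 50, 100000)})

    # the call whose performance improves: one distinct count per group,
    # computed on the grouped Series
    counts = df.groupby('key')['val'].nunique()
    print(counts)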

pandas/core/groupby.py

Lines changed: 27 additions & 2 deletions
@@ -82,8 +82,7 @@
 
 _series_apply_whitelist = \
     (_common_apply_whitelist - set(['boxplot'])) | \
-    frozenset(['dtype', 'value_counts', 'unique', 'nunique',
-               'nlargest', 'nsmallest'])
+    frozenset(['dtype', 'value_counts', 'unique', 'nlargest', 'nsmallest'])
 
 _dataframe_apply_whitelist = \
     _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])

@@ -2558,6 +2557,32 @@ def true_and_notnull(x, *args, **kwargs):
         filtered = self._apply_filter(indices, dropna)
         return filtered
 
+    def nunique(self, dropna=True):
+        ids, _, _ = self.grouper.group_info
+        val = self.obj.get_values()
+
+        sorter = np.lexsort((val, ids))
+        ids, val = ids[sorter], val[sorter]
+
+        # group boundaries are where group ids change
+        # unique observations are where sorted values change
+        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
+        inc = np.r_[1, val[1:] != val[:-1]]
+
+        # 1st item of each group is a new unique observation
+        mask = isnull(val)
+        if dropna:
+            inc[idx] = 1
+            inc[mask] = 0
+        else:
+            inc[mask & np.r_[False, mask[:-1]]] = 0
+            inc[idx] = 1
+
+        out = np.add.reduceat(inc, idx)
+        return Series(out if ids[0] != -1 else out[1:],
+                      index=self.grouper.result_index,
+                      name=self.name)
+
     def _apply_to_column_groupbys(self, func):
         """ return a pass thru """
         return func(self)
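
To make the vectorised trick in ``nunique`` above easier to follow, here is a small standalone sketch of the same idea on toy NumPy arrays. The data is invented and NaN handling is omitted; only the lexsort/``reduceat`` mechanics mirror the method.

    import numpy as np

    ids = np.array([1, 0, 1, 0, 1, 1])   # group label per observation
    val = np.array([5, 7, 5, 7, 9, 5])   # values whose distinct count we want

    # sort by (ids, val): each group becomes contiguous, with its values ordered
    sorter = np.lexsort((val, ids))
    ids, val = ids[sorter], val[sorter]

    # group boundaries: positions where the group id changes (plus position 0)
    idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]

    # a sorted value starts a new "unique run" when it differs from its
    # predecessor; the first element of every group always starts one
    inc = np.r_[1, val[1:] != val[:-1]]
    inc[idx] = 1

    # summing the run starts within each group segment gives the per-group
    # number of distinct values
    out = np.add.reduceat(inc, idx)
    print(out)   # [1 2]: group 0 -> {7}, group 1 -> {5, 9}

The single sort plus two elementwise comparisons replace the per-group Python-level ``apply`` that ``nunique`` previously fell back to, which is where the speedup comes from.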

pandas/tests/test_groupby.py

Lines changed: 35 additions & 1 deletion
@@ -1617,6 +1617,40 @@ def test_groupby_as_index_agg(self):
 
         assert_frame_equal(left, right)
 
+    def test_series_groupby_nunique(self):
+        from itertools import product
+        from string import ascii_lowercase
+
+        def check_nunique(df, keys):
+            for sort, dropna in product((False, True), repeat=2):
+                gr = df.groupby(keys, sort=sort)
+                left = gr['julie'].nunique(dropna=dropna)
+
+                gr = df.groupby(keys, sort=sort)
+                right = gr['julie'].apply(Series.nunique, dropna=dropna)
+
+                assert_series_equal(left, right)
+
+        days = date_range('2015-08-23', periods=10)
+
+        for n, m in product(10**np.arange(2, 6), (10, 100, 1000)):
+            frame = DataFrame({
+                'jim': np.random.choice(list(ascii_lowercase), n),
+                'joe': np.random.choice(days, n),
+                'julie': np.random.randint(0, m, n)})
+
+            check_nunique(frame, ['jim'])
+            check_nunique(frame, ['jim', 'joe'])
+
+            frame.loc[1::17, 'jim'] = None
+            frame.loc[3::37, 'joe'] = None
+            frame.loc[7::19, 'julie'] = None
+            frame.loc[8::19, 'julie'] = None
+            frame.loc[9::19, 'julie'] = None
+
+            check_nunique(frame, ['jim'])
+            check_nunique(frame, ['jim', 'joe'])
+
     def test_mulitindex_passthru(self):
 
         # GH 7997

@@ -4913,7 +4947,7 @@ def test_groupby_whitelist(self):
             'corr', 'cov',
             'value_counts',
             'diff',
-            'unique', 'nunique',
+            'unique',
             'nlargest', 'nsmallest',
         ])
 
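
Along the same lines as the test above, a tiny self-contained check (with made-up data) that the fast path agrees with a per-group ``apply`` of ``Series.nunique``, including the ``dropna`` behaviour:

    import numpy as np
    import pandas as pd
    from pandas.util.testing import assert_series_equal

    df = pd.DataFrame({'jim': list('aabbbcc'),
                       'julie': [1, 1, 2, np.nan, 2, 3, 3]})

    for dropna in (True, False):
        left = df.groupby('jim')['julie'].nunique(dropna=dropna)
        right = df.groupby('jim')['julie'].apply(pd.Series.nunique, dropna=dropna)
        assert_series_equal(left, right)   # distinct 'julie' values per 'jim'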
