diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 61500bedcdcd4..9049d8de550d0 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -721,7 +721,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`) - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`) -- Performance improvements in ``SeriesGroupBy.nunique`` (:issue:`10820`) +- Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` (:issue:`10820`) - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`) - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 220e67c43e4be..444f149e70e34 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -82,7 +82,7 @@ _series_apply_whitelist = \ (_common_apply_whitelist - set(['boxplot'])) | \ - frozenset(['dtype', 'value_counts', 'unique', 'nlargest', 'nsmallest']) + frozenset(['dtype', 'unique', 'nlargest', 'nsmallest']) _dataframe_apply_whitelist = \ _common_apply_whitelist | frozenset(['dtypes', 'corrwith']) @@ -2583,6 +2583,108 @@ def nunique(self, dropna=True): index=self.grouper.result_index, name=self.name) + def value_counts(self, normalize=False, sort=True, ascending=False, + bins=None, dropna=True): + + from functools import partial + from pandas.tools.tile import cut + from pandas.tools.merge import _get_join_indexers + + if bins is not None and not np.iterable(bins): + # scalar bins cannot be done at top level + # in a backward compatible way + return self.apply(Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins) + + ids, _, _ = self.grouper.group_info + val = self.obj.get_values() + + # groupby removes null keys from groupings + mask = ids != -1 + ids, val = ids[mask], val[mask] + 
+ if bins is None: + lab, lev = algos.factorize(val, sort=True) + else: + cat, bins = cut(val, bins, retbins=True) + # bins[:-1] for backward compat; + # o.w. cat.categories could be better + lab, lev, dropna = cat.codes, bins[:-1], False + + sorter = np.lexsort((lab, ids)) + ids, lab = ids[sorter], lab[sorter] + + # group boundaries are where group ids change + idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] + + # new values are where sorted labels change + inc = np.r_[True, lab[1:] != lab[:-1]] + inc[idx] = True # group boundaries are also new values + out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts + + # num. of times each group should be repeated + rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) + + # multi-index components + labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]] + levels = [ping.group_index for ping in self.grouper.groupings] + [lev] + names = self.grouper.names + [self.name] + + if dropna: + mask = labels[-1] != -1 + if mask.all(): + dropna = False + else: + out, labels = out[mask], [label[mask] for label in labels] + + if normalize: + out = out.astype('float') + acc = rep(np.diff(np.r_[idx, len(ids)])) + out /= acc[mask] if dropna else acc + + if sort and bins is None: + cat = ids[inc][mask] if dropna else ids[inc] + sorter = np.lexsort((out if ascending else -out, cat)) + out, labels[-1] = out[sorter], labels[-1][sorter] + + if bins is None: + mi = MultiIndex(levels=levels, labels=labels, names=names, + verify_integrity=False) + + return Series(out, index=mi) + + # for compat. 
with algos.value_counts need to ensure every + # bin is present at every index level, null filled with zeros + diff = np.zeros(len(out), dtype='bool') + for lab in labels[:-1]: + diff |= np.r_[True, lab[1:] != lab[:-1]] + + ncat, nbin = diff.sum(), len(levels[-1]) + + left = [np.repeat(np.arange(ncat), nbin), + np.tile(np.arange(nbin), ncat)] + + right = [diff.cumsum() - 1, labels[-1]] + + _, idx = _get_join_indexers(left, right, sort=False, how='left') + out = np.where(idx != -1, out[idx], 0) + + if sort: + sorter = np.lexsort((out if ascending else -out, left[0])) + out, left[-1] = out[sorter], left[-1][sorter] + + # build the multi-index w/ full levels + labels = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) + labels.append(left[-1]) + + mi = MultiIndex(levels=levels, labels=labels, names=names, + verify_integrity=False) + + return Series(out, index=mi) + def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index e51a13d3a296f..a8bbc372ebe25 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1650,6 +1650,57 @@ def check_nunique(df, keys): check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) + def test_series_groupby_value_counts(self): + from itertools import product + + def rebuild_index(df): + arr = list(map(df.index.get_level_values, range(df.index.nlevels))) + df.index = MultiIndex.from_arrays(arr, names=df.index.names) + return df + + def check_value_counts(df, keys, bins): + for isort, normalize, sort, ascending, dropna \ + in product((False, True), repeat=5): + + kwargs = dict(normalize=normalize, sort=sort, + ascending=ascending, dropna=dropna, + bins=bins) + + gr = df.groupby(keys, sort=isort) + left = gr['3rd'].value_counts(**kwargs) + + gr = df.groupby(keys, sort=isort) + right = gr['3rd'].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ['3rd'] + + # 
have to sort on index because of unstable sort on values + left, right = map(rebuild_index, (left, right)) # xref GH9212 + assert_series_equal(left.sort_index(), right.sort_index()) + + def loop(df): + bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2) + keys = '1st', '2nd', ('1st', '2nd') + for k, b in product(keys, bins): + check_value_counts(df, k, b) + + days = date_range('2015-08-24', periods=10) + + for n, m in product((100, 10000), (5, 20)): + frame = DataFrame({ + '1st':np.random.choice(list('abcd'), n), + '2nd':np.random.choice(days, n), + '3rd':np.random.randint(1, m + 1, n)}) + + loop(frame) + + frame.loc[1::11, '1st'] = nan + frame.loc[3::17, '2nd'] = nan + frame.loc[7::19, '3rd'] = nan + frame.loc[8::19, '3rd'] = nan + frame.loc[9::19, '3rd'] = nan + + loop(frame) + def test_mulitindex_passthru(self): # GH 7997 @@ -4944,7 +4995,6 @@ def test_groupby_whitelist(self): 'plot', 'hist', 'median', 'dtype', 'corr', 'cov', - 'value_counts', 'diff', 'unique', 'nlargest', 'nsmallest',