diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 083242cd69b74..cf5a44442045b 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -581,6 +581,7 @@ Performance Improvements
 - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
 - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
 - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
+- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`)
 
 .. _whatsnew_0230.docs:
 
diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
index 6d80e6f0073eb..a535872ff7279 100644
--- a/pandas/_libs/algos.pxd
+++ b/pandas/_libs/algos.pxd
@@ -11,3 +11,11 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
     a[0] = b[0]
     b[0] = t
     return 0
+
+cdef enum TiebreakEnumType:
+    TIEBREAK_AVERAGE
+    TIEBREAK_MIN
+    TIEBREAK_MAX
+    TIEBREAK_FIRST
+    TIEBREAK_FIRST_DESCENDING
+    TIEBREAK_DENSE
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 5d17488963b1c..a418e54e4da9b 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -31,14 +31,6 @@ cdef double nan = NaN
 
 cdef int64_t iNaT = get_nat()
 
-cdef:
-    int TIEBREAK_AVERAGE = 0
-    int TIEBREAK_MIN = 1
-    int TIEBREAK_MAX = 2
-    int TIEBREAK_FIRST = 3
-    int TIEBREAK_FIRST_DESCENDING = 4
-    int TIEBREAK_DENSE = 5
-
 tiebreakers = {
     'average': TIEBREAK_AVERAGE,
     'min': TIEBREAK_MIN,
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 55de700c9af52..d75c3a71896e3 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -16,8 +16,9 @@ from numpy cimport (ndarray,
 from libc.stdlib cimport malloc, free
 
 from util cimport numeric, get_nat
-from algos cimport swap
-from algos import take_2d_axis1_float64_float64, groupsort_indexer
+from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN,
+                    TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE)
+from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers
 
 cdef int64_t iNaT = get_nat()
 
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index a751fadaf48cf..b24444c422efa 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -444,8 +444,179 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 else:
                     out[i, j] = resx[i, j]
 
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
+                        ndarray[{{c_type}}, ndim=2] values,
+                        ndarray[int64_t] labels,
+                        bint is_datetimelike, object ties_method,
+                        bint ascending, bint pct, object na_option):
+    """Provides the rank of values within each group
+
+    Parameters
+    ----------
+    out : array of float64_t values which this method will write its results to
+    values : array of {{c_type}} values to be ranked
+    labels : array containing unique label for each group, with its ordering
+        matching up to the corresponding record in `values`
+    is_datetimelike : bool
+        unused in this method but provided for call compatibility with other
+        Cython transformations
+    ties_method : {'average', 'min', 'max', 'first', 'dense'}
+        * average: average rank of group
+        * min: lowest rank in group
+        * max: highest rank in group
+        * first: ranks assigned in order they appear in the array
+        * dense: like 'min', but rank always increases by 1 between groups
+    ascending : boolean
+        False for ranks by high (1) to low (N)
+    pct : boolean
+        Compute percentage rank of data within each group
+    na_option : {'keep', 'top', 'bottom'}
+        * keep: leave NA values where they are
+        * top: smallest rank if ascending
+        * bottom: smallest rank if descending
+
+    Notes
+    -----
+    This method modifies the `out` parameter rather than returning an object
+    """
+    cdef:
+        TiebreakEnumType tiebreak
+        Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0
+        Py_ssize_t grp_vals_seen=1, grp_na_count=0
+        ndarray[int64_t] _as
+        ndarray[float64_t, ndim=2] grp_sizes
+        ndarray[{{c_type}}] masked_vals
+        ndarray[uint8_t] mask
+        bint keep_na
+        {{c_type}} nan_fill_val
+
+    tiebreak = tiebreakers[ties_method]
+    keep_na = na_option == 'keep'
+    N, K = (<object> values).shape
+    grp_sizes = np.ones_like(out)
+
+    # Copy values into a new array so missing data can be filled in
+    # via the mask without obscuring the location of missing values
+    # in the original array
+    masked_vals = np.array(values[:, 0], copy=True)
+    {{if name == 'int64'}}
+    mask = (masked_vals == {{nan_val}}).astype(np.uint8)
+    {{else}}
+    mask = np.isnan(masked_vals).astype(np.uint8)
+    {{endif}}
+
+    if ascending ^ (na_option == 'top'):
+        {{if name == 'int64'}}
+        nan_fill_val = np.iinfo(np.int64).max
+        {{else}}
+        nan_fill_val = np.inf
+        {{endif}}
+        order = (masked_vals, mask, labels)
+    else:
+        {{if name == 'int64'}}
+        nan_fill_val = np.iinfo(np.int64).min
+        {{else}}
+        nan_fill_val = -np.inf
+        {{endif}}
+        order = (masked_vals, ~mask, labels)
+    np.putmask(masked_vals, mask, nan_fill_val)
+
+    # lexsort using labels, then mask, then actual values
+    # each label corresponds to a different group; the mask
+    # distinguishes missing values from real values that happen
+    # to equal the fill value before sorting on the actual values
+    _as = np.lexsort(order)
+
+    if not ascending:
+        _as = _as[::-1]
+
+    with nogil:
+        # Loop over the length of the value array
+        # each incremental i value can be looked up in the _as array
+        # that we sorted previously, which gives us the location of
+        # that sorted value for retrieval back from the original
+        # values / masked_vals arrays
+        for i in range(N):
+            # dups and sum_ranks will be incremented each loop where
+            # the value / group remains the same, and should be reset
+            # when either of those changes; both are used to
+            # calculate the tiebreakers
+            dups += 1
+            sum_ranks += i - grp_start + 1
+
+            # if keep_na, check for missing values and assign back
+            # to the result where appropriate
+            if keep_na and masked_vals[_as[i]] == nan_fill_val:
+                grp_na_count += 1
+                out[_as[i], 0] = nan
+            else:
+                # this implementation is inefficient because it will
+                # continue overwriting previously encountered dups
+                # i.e. if 5 duplicated values are encountered it will
+                # write to the result as follows (assumes avg tiebreaker):
+                # 1
+                # 1.5 1.5
+                # 2 2 2
+                # 2.5 2.5 2.5 2.5
+                # 3 3 3 3 3
+                #
+                # could potentially be optimized to only write to the
+                # result once the last duplicate value is encountered
+                if tiebreak == TIEBREAK_AVERAGE:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = sum_ranks / dups
+                elif tiebreak == TIEBREAK_MIN:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = i - grp_start - dups + 2
+                elif tiebreak == TIEBREAK_MAX:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = i - grp_start + 1
+                elif tiebreak == TIEBREAK_FIRST:
+                    for j in range(i - dups + 1, i + 1):
+                        if ascending:
+                            out[_as[j], 0] = j + 1 - grp_start
+                        else:
+                            out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start
+                elif tiebreak == TIEBREAK_DENSE:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = grp_vals_seen
+
+            # look forward to the next value (using the sorting in _as)
+            # if the value does not equal the current value then we need to
+            # reset the dups and sum_ranks, knowing that a new value is
+            # coming up. The conditional also needs to handle nan equality
+            # and the end of iteration
+            if (i == N - 1 or (
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not
+                    (mask[_as[i]] and mask[_as[i+1]]))):
+                dups = sum_ranks = 0
+                val_start = i
+                grp_vals_seen += 1
+
+            # Similar to the previous conditional, check now if we are moving
+            # to a new group. If so, keep track of the index where the new
+            # group occurs, so the tiebreaker calculations can decrement that
+            # from their position. Fill in the size of each group encountered
+            # (used by pct calculations later). Also be sure to reset any of
+            # the items helping to calculate dups
+            if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+                for j in range(grp_start, i + 1):
+                    grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
+                dups = sum_ranks = 0
+                grp_na_count = 0
+                val_start = i + 1
+                grp_start = i + 1
+                grp_vals_seen = 1
+
+    if pct:
+        for i in range(N):
+            out[i, 0] = out[i, 0] / grp_sizes[i, 0]
 {{endfor}}
 
+
 #----------------------------------------------------------------------
 # group_min, group_max
 #----------------------------------------------------------------------
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 01241db7c0c42..0363bcd02aa16 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -994,7 +994,7 @@ def _transform_should_cast(self, func_nm):
         return (self.size().fillna(0) > 0).any() and (func_nm not in
                                                       _cython_cast_blacklist)
 
-    def _cython_transform(self, how, numeric_only=True):
+    def _cython_transform(self, how, numeric_only=True, **kwargs):
         output = collections.OrderedDict()
         for name, obj in self._iterate_slices():
             is_numeric = is_numeric_dtype(obj.dtype)
@@ -1002,12 +1002,16 @@ def _cython_transform(self, how, numeric_only=True):
                 continue
 
             try:
-                result, names = self.grouper.transform(obj.values, how)
+                result, names = self.grouper.transform(obj.values, how,
+                                                       **kwargs)
             except NotImplementedError:
                 continue
             except AssertionError as e:
                 raise GroupByError(str(e))
-            output[name] = self._try_cast(result, obj)
+            if self._transform_should_cast(how):
+                output[name] = self._try_cast(result, obj)
+            else:
+                output[name] = result
 
         if len(output) == 0:
             raise DataError('No numeric types to aggregate')
@@ -1768,6 +1772,37 @@ def cumcount(self, ascending=True):
         cumcounts = self._cumcount_array(ascending=ascending)
         return Series(cumcounts, index)
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def rank(self, method='average', ascending=True, na_option='keep',
+             pct=False, axis=0):
+        """Provides the rank of values within each group
+
+        Parameters
+        ----------
+        method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+            * average: average rank of group
+            * min: lowest rank in group
+            * max: highest rank in group
+            * first: ranks assigned in order they appear in the array
+            * dense: like 'min', but rank always increases by 1 between groups
+        na_option : {'keep', 'top', 'bottom'}, default 'keep'
+            * keep: leave NA values where they are
+            * top: smallest rank if ascending
+            * bottom: smallest rank if descending
+        ascending : boolean, default True
+            False for ranks by high (1) to low (N)
+        pct : boolean, default False
+            Compute percentage rank of data within each group
+
+        Returns
+        -------
+        DataFrame with ranking of values within each group
+        """
+        return self._cython_transform('rank', numeric_only=False,
+                                      ties_method=method, ascending=ascending,
+                                      na_option=na_option, pct=pct, axis=axis)
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def cumprod(self, axis=0, *args, **kwargs):
@@ -2183,6 +2218,16 @@ def get_group_levels(self):
             'cumsum': 'group_cumsum',
             'cummin': 'group_cummin',
             'cummax': 'group_cummax',
+            'rank': {
+                'name': 'group_rank',
+                'f': lambda func, a, b, c, d, **kwargs: func(
+                    a, b, c, d,
+                    kwargs.get('ties_method', 'average'),
+                    kwargs.get('ascending', True),
+                    kwargs.get('pct', False),
+                    kwargs.get('na_option', 'keep')
+                )
+            }
         }
     }
 
@@ -2242,7 +2287,8 @@ def wrapper(*args, **kwargs):
                                     (how, dtype_str))
             return func
 
-    def _cython_operation(self, kind, values, how, axis, min_count=-1):
+    def _cython_operation(self, kind, values, how, axis, min_count=-1,
+                          **kwargs):
         assert kind in ['transform', 'aggregate']
 
         # can we do this operation with our cython functions
@@ -2314,10 +2360,13 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1):
             else:
                 raise
 
-        if is_numeric:
-            out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
+        if how == 'rank':
+            out_dtype = 'float'
         else:
-            out_dtype = 'object'
+            if is_numeric:
+                out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
+            else:
+                out_dtype = 'object'
 
         labels, _, _ = self.group_info
 
@@ -2334,7 +2383,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1):
             # TODO: min_count
 
             result = self._transform(
-                result, values, labels, func, is_numeric, is_datetimelike)
+                result, values, labels, func, is_numeric, is_datetimelike,
+                **kwargs)
 
         if is_integer_dtype(result) and not is_datetimelike:
             mask = result == iNaT
@@ -2373,8 +2423,8 @@ def aggregate(self, values, how, axis=0, min_count=-1):
         return self._cython_operation('aggregate', values, how, axis,
                                       min_count=min_count)
 
-    def transform(self, values, how, axis=0):
-        return self._cython_operation('transform', values, how, axis)
+    def transform(self, values, how, axis=0, **kwargs):
+        return self._cython_operation('transform', values, how, axis, **kwargs)
 
     def _aggregate(self, result, counts, values, comp_ids, agg_func,
                    is_numeric, is_datetimelike, min_count=-1):
@@ -2394,7 +2444,7 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,
         return result
 
     def _transform(self, result, values, comp_ids, transform_func,
-                   is_numeric, is_datetimelike):
+                   is_numeric, is_datetimelike, **kwargs):
         comp_ids, _, ngroups = self.group_info
 
         if values.ndim > 3:
@@ -2406,9 +2456,9 @@ def _transform(self, result, values, comp_ids, transform_func,
                 chunk = chunk.squeeze()
 
                 transform_func(result[:, :, i], values,
-                               comp_ids, is_datetimelike)
+                               comp_ids, is_datetimelike, **kwargs)
         else:
-            transform_func(result, values, comp_ids, is_datetimelike)
+            transform_func(result, values, comp_ids, is_datetimelike, **kwargs)
 
         return result
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 5172efe25d697..2db772ac54369 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1895,6 +1895,172 @@ def test_rank_apply(self):
         expected = expected.reindex(result.index)
         assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("grps", [
+        ['qux'], ['qux', 'quux']])
+    @pytest.mark.parametrize("vals", [
+        [2, 2, 8, 2, 6],
+        [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'),
+         pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
+         pd.Timestamp('2018-01-06')]])
+    @pytest.mark.parametrize("ties_method,ascending,pct,exp", [
+        ('average', True, False, [2., 2., 5., 2., 4.]),
+        ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
+        ('average', False, False, [4., 4., 1., 4., 2.]),
+        ('average', False, True, [.8, .8, .2, .8, .4]),
+        ('min', True, False, [1., 1., 5., 1., 4.]),
+        ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
+        ('min', False, False, [3., 3., 1., 3., 2.]),
+        ('min', False, True, [.6, .6, .2, .6, .4]),
+        ('max', True, False, [3., 3., 5., 3., 4.]),
+        ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
+        ('max', False, False, [5., 5., 1., 5., 2.]),
+        ('max', False, True, [1., 1., .2, 1., .4]),
+        ('first', True, False, [1., 2., 5., 3., 4.]),
+        ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
+        ('first', False, False, [3., 4., 1., 5., 2.]),
+        ('first', False, True, [.6, .8, .2, 1., .4]),
+        ('dense', True, False, [1., 1., 3., 1., 2.]),
+        ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]),
+        ('dense', False, False, [3., 3., 1., 3., 2.]),
+        ('dense', False, True, [.6, .6, .2, .6, .4]),
+    ])
+    def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp):
+        key = np.repeat(grps, len(vals))
+        vals = vals * len(grps)
+        df = DataFrame({'key': key, 'val': vals})
+        result = df.groupby('key').rank(method=ties_method,
+                                        ascending=ascending, pct=pct)
+
+        exp_df = DataFrame(exp * len(grps), columns=['val'])
+        assert_frame_equal(result, exp_df)
+
+    @pytest.mark.parametrize("grps", [
+        ['qux'], ['qux', 'quux']])
+    @pytest.mark.parametrize("vals", [
+        [2, 2, np.nan, 8, 2, 6, np.nan, np.nan],  # floats
+        [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
+         pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
+         pd.Timestamp('2018-01-06'), np.nan, np.nan]
+    ])
+    @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [
+        ('average', True, 'keep', False,
+         [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]),
+        ('average', True, 'keep', True,
+         [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]),
+        ('average', False, 'keep', False,
+         [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]),
+        ('average', False, 'keep', True,
+         [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]),
+        ('min', True, 'keep', False,
+         [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]),
+        ('min', True, 'keep', True,
+         [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
+        ('min', False, 'keep', False,
+         [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
+        ('min', False, 'keep', True,
+         [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
+        ('max', True, 'keep', False,
+         [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]),
+        ('max', True, 'keep', True,
+         [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
+        ('max', False, 'keep', False,
+         [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]),
+        ('max', False, 'keep', True,
+         [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
+        ('first', True, 'keep', False,
+         [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]),
+        ('first', True, 'keep', True,
+         [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
+        ('first', False, 'keep', False,
+         [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]),
+        ('first', False, 'keep', True,
+         [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]),
+        ('dense', True, 'keep', False,
+         [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
+        ('dense', True, 'keep', True,
+         [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]),
+        ('dense', False, 'keep', False,
+         [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
+        ('dense', False, 'keep', True,
+         [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
+        ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
+        ('average', True, 'no_na', True,
+         [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
+        ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]),
+        ('average', False, 'no_na', True,
+         [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]),
+        ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]),
+        ('min', True, 'no_na', True,
+         [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]),
+        ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]),
+        ('min', False, 'no_na', True,
+         [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]),
+        ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]),
+        ('max', True, 'no_na', True,
+         [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]),
+        ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]),
+        ('max', False, 'no_na', True,
+         [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]),
+        ('first', True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]),
+        ('first', True, 'no_na', True,
+         [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]),
+        ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]),
+        ('first', False, 'no_na', True,
+         [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
+        ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
+        ('dense', True, 'no_na', True,
+         [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]),
+        ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
+        ('dense', False, 'no_na', True,
+         [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5])
+    ])
+    def test_rank_args_missing(self, grps, vals, ties_method, ascending,
+                               na_option, pct, exp):
+        key = np.repeat(grps, len(vals))
+        vals = vals * len(grps)
+        df = DataFrame({'key': key, 'val': vals})
+        result = df.groupby('key').rank(method=ties_method,
+                                        ascending=ascending,
+                                        na_option=na_option, pct=pct)
+
+        exp_df = DataFrame(exp * len(grps), columns=['val'])
+        assert_frame_equal(result, exp_df)
+
+    @pytest.mark.parametrize("pct,exp", [
+        (False, [3., 3., 3., 3., 3.]),
+        (True, [.6, .6, .6, .6, .6])])
+    def test_rank_resets_each_group(self, pct, exp):
+        df = DataFrame(
+            {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'],
+             'val': [1] * 10}
+        )
+        result = df.groupby('key').rank(pct=pct)
+        exp_df = DataFrame(exp * 2, columns=['val'])
+        assert_frame_equal(result, exp_df)
+
+    def test_rank_avg_even_vals(self):
+        df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4})
+        result = df.groupby('key').rank()
+        exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val'])
+        assert_frame_equal(result, exp_df)
+
+    @pytest.mark.parametrize("ties_method", [
+        'average', 'min', 'max', 'first', 'dense'])
+    @pytest.mark.parametrize("ascending", [True, False])
+    @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
+    @pytest.mark.parametrize("pct", [True, False])
+    @pytest.mark.parametrize("vals", [
+        ['bar', 'bar', 'foo', 'bar', 'baz'],
+        ['bar', np.nan, 'foo', np.nan, 'baz']
+    ])
+    def test_rank_object_raises(self, ties_method, ascending, na_option,
+                                pct, vals):
+        df = DataFrame({'key': ['foo'] * 5, 'val': vals})
+        with tm.assert_raises_regex(TypeError, "not callable"):
+            df.groupby('key').rank(method=ties_method,
+                                   ascending=ascending,
+                                   na_option=na_option, pct=pct)
+
     def test_dont_clobber_name_column(self):
         df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
                         'name': ['foo', 'bar', 'baz'] * 2})
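
Reviewer note: as a quick illustration of the API these tests pin down, the following usage against a build that includes this change should reproduce the 'average' and 'dense' rows of the first parametrized test (the frame mirrors the test fixtures):

import pandas as pd

df = pd.DataFrame({'key': ['qux'] * 5, 'val': [2, 2, 8, 2, 6]})

# average rank of ties within the group -> [2.0, 2.0, 5.0, 2.0, 4.0]
print(df.groupby('key').rank(method='average'))

# dense, descending ranks as a fraction of group size
# -> [0.6, 0.6, 0.2, 0.6, 0.4]
print(df.groupby('key').rank(method='dense', ascending=False, pct=True))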
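One kernel detail worth spelling out is the fill-value selection for missing data. The test `ascending ^ (na_option == 'top')` is true exactly when NaNs should land at the high end of the pre-reversal sort (the later `_as = _as[::-1]` step flips descending sorts), so they are filled with the dtype's maximum (or +inf); otherwise they get the minimum (or -inf). A plain-Python sketch of the truth table, not part of the diff:

import numpy as np

for ascending in (True, False):
    for na_option in ('keep', 'top', 'bottom'):
        # True -> fill missing values with the largest possible value so
        # they sort to the end of the (pre-reversal) ascending order
        fill_high = ascending ^ (na_option == 'top')
        fill = np.inf if fill_high else -np.inf
        print(ascending, na_option, '->', fill)

Either way, the saved mask keeps filled NaNs distinguishable from real values that happen to equal the fill value.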
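For readers who do not want to parse the Cython/Tempita, here is a pure-NumPy sketch (not part of the diff) of the strategy the group_rank kernel uses: mask the missing values, fill them so they sort to one end of each group, perform a single np.lexsort keyed by labels, then the mask, then the values, and finally walk the sorted order, resetting rank state at every value change and group boundary. It is simplified to ties_method='average' and na_option='keep', and it omits the mask inversion the kernel applies for descending sorts; the helper name group_rank_average is illustrative, not a pandas API.

import numpy as np

def group_rank_average(values, labels, ascending=True, pct=False):
    """Illustrative only: average-tie ranks of float `values` within the
    groups given by int `labels`, with NaNs kept as NaN."""
    values = np.asarray(values, dtype=np.float64)
    labels = np.asarray(labels, dtype=np.int64)
    n = len(values)
    out = np.empty(n)
    grp_sizes = np.ones(n)

    # Fill missing data so it sorts to one end of each group, keeping the
    # mask around to distinguish filled NaNs from real extreme values.
    mask = np.isnan(values)
    masked = np.where(mask, np.inf if ascending else -np.inf, values)

    # Single lexsort: labels are the primary key (last in the tuple),
    # then the mask, then the values themselves.
    order = np.lexsort((masked, mask, labels))
    if not ascending:
        order = order[::-1]

    dups = sum_ranks = grp_na = 0
    grp_start = 0
    for i in range(n):
        dups += 1
        sum_ranks += i - grp_start + 1

        if mask[order[i]]:
            grp_na += 1
            out[order[i]] = np.nan
        else:
            # Average tiebreak: each member of a run of duplicates gets the
            # mean of the 1-based positions the run occupies in its group.
            for j in range(i - dups + 1, i + 1):
                out[order[j]] = sum_ranks / dups

        # Reset duplicate tracking when the next sorted value differs.
        if i == n - 1 or masked[order[i]] != masked[order[i + 1]]:
            dups = sum_ranks = 0

        # At a group boundary, record the group's non-NaN size (used for
        # pct) and reset all per-group state.
        if i == n - 1 or labels[order[i]] != labels[order[i + 1]]:
            grp_sizes[order[grp_start:i + 1]] = i - grp_start + 1 - grp_na
            dups = sum_ranks = grp_na = 0
            grp_start = i + 1

    return out / grp_sizes if pct else out

As a check against the tests above, group_rank_average([2, 2, 8, 2, 6], [0] * 5) returns [2., 2., 5., 2., 4.], matching the 'average' row of the parametrized expectations.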