From 396f1b69afd8772f7810243406f8e01437bffa61 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 29 Jan 2018 10:22:45 -0500 Subject: [PATCH 01/35] Initial working rank with no tiebreaker --- pandas/_libs/groupby_helper.pxi.in | 33 ++++++++++++++++++++++++++++++ pandas/core/groupby.py | 7 +++++++ 2 files changed, 40 insertions(+) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index a751fadaf48cf..58a8b442e8c2a 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -444,6 +444,39 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_rank_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels, + bint is_datetimelike): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K + int64_t lab, idx, counter=1 + ndarray[int64_t] _as + + N, K = ( values).shape + + _as = np.lexsort((values[:, 0], labels)) + + with nogil: + for i in range(N): + idx = _as[i] + lab = labels[idx] + if i > 0 and lab == labels[_as[i-1]]: + counter += 1 + else: + counter = 1 + if lab < 0: + continue + + for j in range(K): + out[idx, j] = counter + {{endfor}} #---------------------------------------------------------------------- diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 01241db7c0c42..e2fa32c0e5139 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1768,6 +1768,12 @@ def cumcount(self, ascending=True): cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) + @Substitution(name='groupby') + @Appender(_doc_template) + def rank(self, axis=0, *args, **kwargs): + """Rank within each group""" + return self._cython_transform('rank', **kwargs) + @Substitution(name='groupby') @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): @@ -2183,6 +2189,7 @@ def 
get_group_levels(self): 'cumsum': 'group_cumsum', 'cummin': 'group_cummin', 'cummax': 'group_cummax', + 'rank': 'group_rank', } } From c2c217731d7bd4a257697e6b5d6b12cb7905fa5f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 31 Jan 2018 11:03:05 -0800 Subject: [PATCH 02/35] Allowed kwargs to pass through to Cython func --- pandas/_libs/groupby_helper.pxi.in | 2 +- pandas/core/groupby.py | 25 +++++++++++++++---------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 58a8b442e8c2a..0d28a9e7b9731 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -450,7 +450,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_rank_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, - bint is_datetimelike): + bint is_datetimelike, **kwargs): """ Only transforms on axis=0 """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e2fa32c0e5139..428c5b4657b88 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -994,7 +994,7 @@ def _transform_should_cast(self, func_nm): return (self.size().fillna(0) > 0).any() and (func_nm not in _cython_cast_blacklist) - def _cython_transform(self, how, numeric_only=True): + def _cython_transform(self, how, numeric_only=True, **kwargs): output = collections.OrderedDict() for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -1002,7 +1002,7 @@ def _cython_transform(self, how, numeric_only=True): continue try: - result, names = self.grouper.transform(obj.values, how) + result, names = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue except AssertionError as e: @@ -1770,9 +1770,12 @@ def cumcount(self, ascending=True): @Substitution(name='groupby') @Appender(_doc_template) - def rank(self, axis=0, *args, **kwargs): + def rank(self, 
ties_method='average', ascending=True, na_option='keep', + pct=False, axis=0): """Rank within each group""" - return self._cython_transform('rank', **kwargs) + return self._cython_transform('rank', ties_method=ties_method, + ascending=ascending, na_option=na_option, + pct=pct, axis=axis) @Substitution(name='groupby') @Appender(_doc_template) @@ -2249,7 +2252,8 @@ def wrapper(*args, **kwargs): (how, dtype_str)) return func - def _cython_operation(self, kind, values, how, axis, min_count=-1): + def _cython_operation(self, kind, values, how, axis, min_count=-1, + **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions @@ -2341,7 +2345,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1): # TODO: min_count result = self._transform( - result, values, labels, func, is_numeric, is_datetimelike) + result, values, labels, func, is_numeric, is_datetimelike, + **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT @@ -2380,8 +2385,8 @@ def aggregate(self, values, how, axis=0, min_count=-1): return self._cython_operation('aggregate', values, how, axis, min_count=min_count) - def transform(self, values, how, axis=0): - return self._cython_operation('transform', values, how, axis) + def transform(self, values, how, axis=0, **kwargs): + return self._cython_operation('transform', values, how, axis, **kwargs) def _aggregate(self, result, counts, values, comp_ids, agg_func, is_numeric, is_datetimelike, min_count=-1): @@ -2401,7 +2406,7 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func, return result def _transform(self, result, values, comp_ids, transform_func, - is_numeric, is_datetimelike): + is_numeric, is_datetimelike, **kwargs): comp_ids, _, ngroups = self.group_info if values.ndim > 3: @@ -2415,7 +2420,7 @@ def _transform(self, result, values, comp_ids, transform_func, transform_func(result[:, :, i], values, comp_ids, is_datetimelike) else: - 
transform_func(result, values, comp_ids, is_datetimelike) + transform_func(result, values, comp_ids, is_datetimelike, **kwargs) return result From 529503f86a7508600215287b292132e0613b2777 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 31 Jan 2018 14:04:57 -0800 Subject: [PATCH 03/35] Comprehensive tests for all groupby rank args --- pandas/tests/groupby/test_groupby.py | 162 +++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5172efe25d697..ba9f3da022150 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1895,6 +1895,168 @@ def test_rank_apply(self): expected = expected.reindex(result.index) assert_series_equal(result, expected) + @pytest.mark.parametrize("vals", [ + [2, 2, 8, 2, 6], ['bar', 'bar', 'foo', 'bar', 'baz']]) + @pytest.mark.parametrize("ties_method,ascending,pct,exp", [ + ('average', True, False, DataFrame( + [2., 2., 5., 2., 4.], columns=['val'])), + ('average', True, True, DataFrame( + [0.4, 0.4, 1.0, 0.4, 0.8], columns=['val'])), + ('average', False, False, DataFrame( + [4., 4., 1., 4., 2.], columns=['val'])), + ('average', False, True, DataFrame( + [.8, .8, .2, .8, .4], columns=['val'])), + ('min', True, False, DataFrame( + [1., 1., 5., 1., 4.], columns=['val'])), + ('min', True, True, DataFrame( + [0.2, 0.2, 1.0, 0.2, 0.8], columns=['val'])), + ('min', False, False, DataFrame( + [3., 3., 1., 3., 2.], columns=['val'])), + ('min', False, True, DataFrame( + [.6, .6, .2, .6, .4], columns=['val'])), + ('max', True, False, DataFrame( + [3., 3., 5., 3., 4.], columns=['val'])), + ('max', True, True, DataFrame( + [0.6, 0.6, 1.0, 0.6, 0.8], columns=['val'])), + ('max', False, False, DataFrame( + [5., 5., 1., 5., 2.], columns=['val'])), + ('max', False, True, DataFrame( + [1., 1., .2, 1., .4], columns=['val'])), + ('first', True, False, DataFrame( + [1., 2., 5., 3., 4.], columns=['val'])), + ('first', True, 
True, DataFrame( + [0.2, 0.4, 1.0, 0.6, 0.8], columns=['val'])), + ('first', False, False, DataFrame( + [3., 4., 1., 5., 2.], columns=['val'])), + ('first', False, True, DataFrame( + [.6, .8, .2, 1., .4], columns=['val'])), + ('dense', True, False, DataFrame( + [1., 1., 3., 1., 2.], columns=['val'])), + ('dense', True, True, DataFrame( + [0.2, 0.2, 0.6, 0.2, 0.4], columns=['val'])), + ('dense', False, False, DataFrame( + [3., 3., 1., 3., 2.], columns=['val'])), + ('dense', False, True, DataFrame( + [.6, .6, .2, .6, .4], columns=['val'])), + ]) + def test_rank_args(self, vals, ties_method, ascending, pct, exp): + if ties_method == 'first' and vals[0] == 'bar': + pytest.xfail("See GH 19482") + df = DataFrame({'key': ['foo']*5, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, ascending=ascending, + pct=pct) + + assert_frame_equal(result, exp) + + @pytest.mark.parametrize("vals", [ + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats + ['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan] # objects + ]) + @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ + ('average', True, 'keep', False, DataFrame( + [2., 2., np.nan, 5., 2., 4., np.nan, np.nan], columns=['val'])), + ('average', True, 'keep', True, DataFrame( + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan], + columns=['val'])), + ('average', False, 'keep', False, DataFrame( + [4., 4., np.nan, 1., 4., 2., np.nan, np.nan], columns=['val'])), + ('average', False, 'keep', True, DataFrame( + [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan], columns=['val'])), + ('min', True, 'keep', False, DataFrame( + [1., 1., np.nan, 5., 1., 4., np.nan, np.nan], columns=['val'])), + ('min', True, 'keep', True, DataFrame( + [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan], + columns=['val'])), + ('min', False, 'keep', False, DataFrame( + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan], columns=['val'])), + ('min', False, 'keep', True, DataFrame( + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, 
np.nan], columns=['val'])), + ('max', True, 'keep', False, DataFrame( + [3., 3., np.nan, 5., 3., 4., np.nan, np.nan], columns=['val'])), + ('max', True, 'keep', True, DataFrame( + [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], + columns=['val'])), + ('max', False, 'keep', False, DataFrame( + [5., 5., np.nan, 1., 5., 2., np.nan, np.nan], columns=['val'])), + ('max', False, 'keep', True, DataFrame( + [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan], columns=['val'])), + ('first', True, 'keep', False, DataFrame( + [1., 2., np.nan, 5., 3., 4., np.nan, np.nan], columns=['val'])), + ('first', True, 'keep', True, DataFrame( + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], + columns=['val'])), + ('first', False, 'keep', False, DataFrame( + [3., 4., np.nan, 1., 5., 2., np.nan, np.nan], columns=['val'])), + ('first', False, 'keep', True, DataFrame( + [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan], columns=['val'])), + ('dense', True, 'keep', False, DataFrame( + [1., 1., np.nan, 3., 1., 2., np.nan, np.nan], columns=['val'])), + ('dense', True, 'keep', True, DataFrame( + [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan], + columns=['val'])), + ('dense', False, 'keep', False, DataFrame( + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan], columns=['val'])), + ('dense', False, 'keep', True, DataFrame( + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan], columns=['val'])), + ('average', True, 'no_na', False, DataFrame( + [2., 2., 7., 5., 2., 4., 7., 7.], columns=['val'])), + ('average', True, 'no_na', True, DataFrame( + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875], + columns=['val'])), + ('average', False, 'no_na', False, DataFrame( + [4., 4., 7.0, 1., 4., 2., 7.0, 7.0], columns=['val'])), + ('average', False, 'no_na', True, DataFrame( + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], columns=['val'])), + ('min', True, 'no_na', False, DataFrame( + [1., 1., 6., 5., 1., 4., 6., 6.], columns=['val'])), + ('min', True, 'no_na', True, DataFrame( + [0.125, 0.125, 0.75, 
0.625, 0.125, 0.5, 0.75, 0.75], + columns=['val'])), + ('min', False, 'no_na', False, DataFrame( + [3., 3., 6., 1., 3., 2., 6., 6.], columns=['val'])), + ('min', False, 'no_na', True, DataFrame( + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75], + columns=['val'])), + ('max', True, 'no_na', False, DataFrame( + [3., 3., 8., 5., 3., 4., 8., 8.], columns=['val'])), + ('max', True, 'no_na', True, DataFrame( + [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.], columns=['val'])), + ('max', False, 'no_na', False, DataFrame( + [5., 5., 8., 1., 5., 2., 8., 8.], columns=['val'])), + ('max', False, 'no_na', True, DataFrame( + [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.], columns=['val'])), + ('first', True, 'no_na', False, DataFrame( + [1., 2., 6., 5., 3., 4., 7., 8.], columns=['val'])), + ('first', True, 'no_na', True, DataFrame( + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.], + columns=['val'])), + ('first', False, 'no_na', False, DataFrame( + [3., 4., 6., 1., 5., 2., 7., 8.], columns=['val'])), + ('first', False, 'no_na', True, DataFrame( + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.], + columns=['val'])), + ('dense', True, 'no_na', False, DataFrame( + [1., 1., 4., 3., 1., 2., 4., 4.], columns=['val'])), + ('dense', True, 'no_na', True, DataFrame( + [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5], + columns=['val'])), + ('dense', False, 'no_na', False, DataFrame( + [3., 3., 4., 1., 3., 2., 4., 4.], columns=['val'])), + ('dense', False, 'no_na', True, DataFrame( + [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5], + columns=['val'])), + ]) + def test_rank_args_missing(self, vals, ties_method, ascending, na_option, + pct, exp): + if ties_method == 'first' and vals[0] == 'bar': + pytest.xfail("See GH 19482") + + df = DataFrame({'key': ['foo']*8, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, ascending=ascending, + na_option=na_option, pct=pct) + + assert_frame_equal(result, exp) + def test_dont_clobber_name_column(self): df = 
DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) From c7faa3b2f5b197ca0399a157e79daa973a0dff71 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 1 Feb 2018 10:16:58 -0800 Subject: [PATCH 04/35] Working avg tiebreak with nan handling --- pandas/_libs/algos.pxd | 8 ++++++++ pandas/_libs/groupby.pyx | 6 ++++-- pandas/_libs/groupby_helper.pxi.in | 33 ++++++++++++++++++------------ pandas/core/groupby.py | 4 ++-- 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 6d80e6f0073eb..3834a68b67075 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -11,3 +11,11 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: a[0] = b[0] b[0] = t return 0 + +cdef: + int TIEBREAK_AVERAGE = 0 + int TIEBREAK_MIN = 1 + int TIEBREAK_MAX = 2 + int TIEBREAK_FIRST = 3 + int TIEBREAK_FIRST_DESCENDING = 4 + int TIEBREAK_DENSE = 5 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 55de700c9af52..af32a617c3a63 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -13,11 +13,13 @@ from numpy cimport (ndarray, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, float32_t, float64_t) +from libc.math cimport isnan from libc.stdlib cimport malloc, free from util cimport numeric, get_nat -from algos cimport swap -from algos import take_2d_axis1_float64_float64, groupsort_indexer +from algos cimport (swap, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX, + TIEBREAK_FIRST, TIEBREAK_DENSE) +from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers cdef int64_t iNaT = get_nat() diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 0d28a9e7b9731..4e51b22536edd 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -444,7 +444,6 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] - 
@cython.boundscheck(False) @cython.wraparound(False) def group_rank_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @@ -455,27 +454,35 @@ def group_rank_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, Only transforms on axis=0 """ cdef: + int tiebreak Py_ssize_t i, j, N, K - int64_t lab, idx, counter=1 + int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0 ndarray[int64_t] _as + tiebreak = tiebreakers[kwargs['ties_method']] N, K = ( values).shape _as = np.lexsort((values[:, 0], labels)) with nogil: for i in range(N): - idx = _as[i] - lab = labels[idx] - if i > 0 and lab == labels[_as[i-1]]: - counter += 1 - else: - counter = 1 - if lab < 0: - continue - - for j in range(K): - out[idx, j] = counter + dups += 1 + sum_ranks += i - grp_start + 1 + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = sum_ranks / dups + + if (i == N - 1 or ( + (values[_as[i], 0] != values[_as[i+1], 0]) and not + (isnan(values[_as[i], 0]) and + isnan(values[_as[i+1], 0]) + ))): + dups = sum_ranks = 0 + val_start = i + + if i == 0 or labels[_as[i]] != labels[_as[i-1]]: + grp_start = i {{endfor}} diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 428c5b4657b88..bfd87c0a50727 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1770,10 +1770,10 @@ def cumcount(self, ascending=True): @Substitution(name='groupby') @Appender(_doc_template) - def rank(self, ties_method='average', ascending=True, na_option='keep', + def rank(self, method='average', ascending=True, na_option='keep', pct=False, axis=0): """Rank within each group""" - return self._cython_transform('rank', ties_method=ties_method, + return self._cython_transform('rank', ties_method=method, ascending=ascending, na_option=na_option, pct=pct, axis=axis) From baeb1921ab7b8c1b2be90f68d96a313dedfe9b81 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 1 Feb 2018 10:58:06 -0800 Subject: [PATCH 05/35] Added remaining tiebreakers; fixed int/float dtype mixup --- 
pandas/_libs/groupby.pyx | 2 +- pandas/_libs/groupby_helper.pxi.in | 18 ++++++++++++++++-- pandas/core/groupby.py | 14 ++++++++++---- pandas/tests/groupby/test_groupby.py | 1 - 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index af32a617c3a63..31fba364dc418 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -18,7 +18,7 @@ from libc.stdlib cimport malloc, free from util cimport numeric, get_nat from algos cimport (swap, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX, - TIEBREAK_FIRST, TIEBREAK_DENSE) + TIEBREAK_FIRST, TIEBREAK_FIRST_DESCENDING, TIEBREAK_DENSE) from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers cdef int64_t iNaT = get_nat() diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 4e51b22536edd..ffd0feaa28b1c 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -446,7 +446,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_rank_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, +def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, bint is_datetimelike, **kwargs): @@ -472,7 +472,21 @@ def group_rank_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = sum_ranks / dups - + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = 2 * (i - grp_start) - j - dups + 2 + elif tiebreak == 
TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = val_start - grp_start if (i == N - 1 or ( (values[_as[i], 0] != values[_as[i+1], 0]) and not (isnan(values[_as[i], 0]) and diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index bfd87c0a50727..e3fa31aafa8d5 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1007,7 +1007,10 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): continue except AssertionError as e: raise GroupByError(str(e)) - output[name] = self._try_cast(result, obj) + if self._transform_should_cast(how): + output[name] = self._try_cast(result, obj) + else: + output[name] = result if len(output) == 0: raise DataError('No numeric types to aggregate') @@ -2325,10 +2328,13 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, else: raise - if is_numeric: - out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) + if how == 'rank': + out_dtype = 'float' else: - out_dtype = 'object' + if is_numeric: + out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) + else: + out_dtype = 'object' labels, _, _ = self.group_info diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ba9f3da022150..67bd51208d28f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1945,7 +1945,6 @@ def test_rank_args(self, vals, ties_method, ascending, pct, exp): df = DataFrame({'key': ['foo']*5, 'val': vals}) result = df.groupby('key').rank(method=ties_method, ascending=ascending, pct=pct) - assert_frame_equal(result, exp) @pytest.mark.parametrize("vals", [ From 07c8e0f13f820e0431f58cce77a2fa2734edcca1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 1 Feb 2018 11:19:32 -0800 Subject: [PATCH 06/35] Added func for obj support --- pandas/_libs/groupby.pyx | 53 ++++++++++++++++++++++++++++++++++++++++ pandas/core/groupby.py | 2 +- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git 
a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 31fba364dc418..72aace3ffb1cc 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -125,6 +125,59 @@ def group_last_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_rank_object(ndarray[float64_t, ndim=2] out, + ndarray[object, ndim=2] values, + ndarray[int64_t] labels, + bint is_datetimelike, **kwargs): + """ + Only transforms on axis=0 + """ + cdef: + int tiebreak + Py_ssize_t i, j, N, K + int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0 + ndarray[int64_t] _as + + tiebreak = tiebreakers[kwargs['ties_method']] + N, K = ( values).shape + + _as = np.lexsort((values[:, 0], labels)) + + for i in range(N): + dups += 1 + sum_ranks += i - grp_start + 1 + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = 2 * (i - grp_start) - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = val_start - grp_start + + if (i == N - 1 or ( + (values[_as[i], 0] != values[_as[i+1], 0]) and not + (values[_as[i], 0] is np.nan and values[_as[i+1], 0] is np.nan) + )): + dups = sum_ranks = 0 + val_start = i + + if i == 0 or labels[_as[i]] != labels[_as[i-1]]: + grp_start = i + cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e3fa31aafa8d5..547eed4ea6bab 100644 --- a/pandas/core/groupby.py +++ 
b/pandas/core/groupby.py @@ -1776,7 +1776,7 @@ def cumcount(self, ascending=True): def rank(self, method='average', ascending=True, na_option='keep', pct=False, axis=0): """Rank within each group""" - return self._cython_transform('rank', ties_method=method, + return self._cython_transform('rank', numeric_only=False, ties_method=method, ascending=ascending, na_option=na_option, pct=pct, axis=axis) From 2ba664344628e4c6ca69a3897628fa5c8bb091b4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 1 Feb 2018 11:43:57 -0800 Subject: [PATCH 07/35] Added pct support --- pandas/_libs/groupby.pyx | 9 +++++++-- pandas/_libs/groupby_helper.pxi.in | 10 ++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 72aace3ffb1cc..6c03216e1a339 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -139,8 +139,10 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, Py_ssize_t i, j, N, K int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0 ndarray[int64_t] _as + bint pct tiebreak = tiebreakers[kwargs['ties_method']] + pct = kwargs['pct'] N, K = ( values).shape _as = np.lexsort((values[:, 0], labels)) @@ -175,8 +177,11 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, dups = sum_ranks = 0 val_start = i - if i == 0 or labels[_as[i]] != labels[_as[i-1]]: - grp_start = i + if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: + if pct: + for j in range(grp_start, i + 1): + out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1) + grp_start = i + 1 cdef inline float64_t median_linear(float64_t* a, int n) nogil: diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index ffd0feaa28b1c..968487df797d4 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -444,6 +444,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) def 
group_rank_{{name}}(ndarray[float64_t, ndim=2] out, @@ -458,8 +459,10 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, Py_ssize_t i, j, N, K int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0 ndarray[int64_t] _as + bint pct tiebreak = tiebreakers[kwargs['ties_method']] + pct = kwargs['pct'] N, K = ( values).shape _as = np.lexsort((values[:, 0], labels)) @@ -495,8 +498,11 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, dups = sum_ranks = 0 val_start = i - if i == 0 or labels[_as[i]] != labels[_as[i-1]]: - grp_start = i + if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: + if pct: + for j in range(grp_start, i + 1): + out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1) + grp_start = i + 1 {{endfor}} From 4e54aa501e34a451d1e45ff82fd813186895a43e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 1 Feb 2018 12:37:32 -0800 Subject: [PATCH 08/35] Added support for sorting --- pandas/_libs/groupby.pyx | 6 +++++- pandas/_libs/groupby_helper.pxi.in | 14 +++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 6c03216e1a339..b4e2d6f200261 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -139,14 +139,18 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, Py_ssize_t i, j, N, K int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0 ndarray[int64_t] _as - bint pct + bint pct, ascending tiebreak = tiebreakers[kwargs['ties_method']] pct = kwargs['pct'] + ascending = kwargs['ascending'] N, K = ( values).shape _as = np.lexsort((values[:, 0], labels)) + if not ascending: + _as = _as[::-1] + for i in range(N): dups += 1 sum_ranks += i - grp_start + 1 diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 968487df797d4..77cab9841e7c0 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -459,14 +459,18 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, Py_ssize_t i, j, N, 
K int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0 ndarray[int64_t] _as - bint pct + bint pct, ascending tiebreak = tiebreakers[kwargs['ties_method']] + ascending = kwargs['ascending'] pct = kwargs['pct'] N, K = ( values).shape _as = np.lexsort((values[:, 0], labels)) + if not ascending: + _as = _as[::-1] + with nogil: for i in range(N): dups += 1 @@ -483,10 +487,10 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, out[_as[j], 0] = i - grp_start + 1 elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = 2 * (i - grp_start) - j - dups + 2 + if ascending: + out[_as[j], 0] = j + 1 + else: + out[_as[j], 0] = 2 * i - j - dups + 2 elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = val_start - grp_start From 428d32c53cadc060e1c5b9cc43f56d410ece1b9a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 1 Feb 2018 12:52:21 -0800 Subject: [PATCH 09/35] Working tests (excl missing data) --- pandas/_libs/groupby.pyx | 13 +++++++------ pandas/_libs/groupby_helper.pxi.in | 7 +++++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index b4e2d6f200261..e2eff49c072a2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -137,7 +137,7 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, cdef: int tiebreak Py_ssize_t i, j, N, K - int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0 + int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, vals_seen=1 ndarray[int64_t] _as bint pct, ascending @@ -166,13 +166,13 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, out[_as[j], 0] = i - grp_start + 1 elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = 2 * (i - 
grp_start) - j - dups + 2 + if ascending: + out[_as[j], 0] = j + 1 + else: + out[_as[j], 0] = 2 * i - j - dups + 2 elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = val_start - grp_start + out[_as[j], 0] = vals_seen if (i == N - 1 or ( (values[_as[i], 0] != values[_as[i+1], 0]) and not @@ -180,6 +180,7 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, )): dups = sum_ranks = 0 val_start = i + vals_seen += 1 if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: if pct: diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 77cab9841e7c0..c91e690e16027 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -457,7 +457,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, cdef: int tiebreak Py_ssize_t i, j, N, K - int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0 + int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, vals_seen=1 ndarray[int64_t] _as bint pct, ascending @@ -493,7 +493,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, out[_as[j], 0] = 2 * i - j - dups + 2 elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = val_start - grp_start + out[_as[j], 0] = vals_seen + if (i == N - 1 or ( (values[_as[i], 0] != values[_as[i+1], 0]) and not (isnan(values[_as[i], 0]) and @@ -501,12 +502,14 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, ))): dups = sum_ranks = 0 val_start = i + vals_seen += 1 if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: if pct: for j in range(grp_start, i + 1): out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1) grp_start = i + 1 + vals_seen = 1 {{endfor}} From 902ef3c98d918e259f439f8dc25b52f53ce1ad65 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 1 Feb 2018 12:57:33 -0800 Subject: [PATCH 10/35] Added Timestamps to tests --- pandas/tests/groupby/test_groupby.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git 
a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 67bd51208d28f..fe02b6f38e907 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1896,7 +1896,10 @@ def test_rank_apply(self): assert_series_equal(result, expected) @pytest.mark.parametrize("vals", [ - [2, 2, 8, 2, 6], ['bar', 'bar', 'foo', 'bar', 'baz']]) + [2, 2, 8, 2, 6], ['bar', 'bar', 'foo', 'bar', 'baz'], + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06')]]) @pytest.mark.parametrize("ties_method,ascending,pct,exp", [ ('average', True, False, DataFrame( [2., 2., 5., 2., 4.], columns=['val'])), @@ -1949,8 +1952,10 @@ def test_rank_args(self, vals, ties_method, ascending, pct, exp): @pytest.mark.parametrize("vals", [ [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats - ['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan] # objects - ]) + ['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06'), np.nan, np.nan]]) @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ ('average', True, 'keep', False, DataFrame( [2., 2., np.nan, 5., 2., 4., np.nan, np.nan], columns=['val'])), @@ -2005,7 +2010,8 @@ def test_rank_args(self, vals, ties_method, ascending, pct, exp): ('average', False, 'no_na', False, DataFrame( [4., 4., 7.0, 1., 4., 2., 7.0, 7.0], columns=['val'])), ('average', False, 'no_na', True, DataFrame( - [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], columns=['val'])), + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], + columns=['val'])), ('min', True, 'no_na', False, DataFrame( [1., 1., 6., 5., 1., 4., 6., 6.], columns=['val'])), ('min', True, 'no_na', True, DataFrame( From ecd4b51e3f89a7d81fe6e06112cb48922b28b78a Mon Sep 17 00:00:00 2001 From: 
Will Ayd Date: Mon, 5 Feb 2018 08:36:43 -0800 Subject: [PATCH 11/35] Working rank with numeric and missing --- pandas/_libs/groupby.pyx | 59 ++++++++++++++-------- pandas/_libs/groupby_helper.pxi.in | 75 +++++++++++++++++++--------- pandas/tests/groupby/test_groupby.py | 9 ++-- 3 files changed, 95 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e2eff49c072a2..69d63288148e4 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -26,6 +26,8 @@ cdef int64_t iNaT = get_nat() cdef double NaN = np.NaN cdef double nan = NaN +import missing + # TODO: aggregate multiple columns in single pass # ---------------------------------------------------------------------- @@ -142,11 +144,25 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, bint pct, ascending tiebreak = tiebreakers[kwargs['ties_method']] - pct = kwargs['pct'] ascending = kwargs['ascending'] + pct = kwargs['pct'] + keep_na = kwargs['na_option'] == 'keep' N, K = ( values).shape - _as = np.lexsort((values[:, 0], labels)) + vals = np.array(values[:, 0], copy=True) + mask = missing.isnaobj(vals) + + try: + _as = np.lexsort((vals, labels)) + except TypeError: + # lexsort fails when missing data and objects are mixed + # fallback to argsort + order = (vals, mask, labels) + _values = np.asarray(list(zip(order[0], order[1], order[2])), + dtype=[('values', 'O'), ('mask', '?'), + ('labels', 'i8')]) + _as = np.argsort(_values, kind='mergesort', order=('labels', + 'mask', 'values')) if not ascending: _as = _as[::-1] @@ -155,24 +171,27 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, dups += 1 sum_ranks += i - grp_start + 1 - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - 
grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j], 0] = j + 1 - else: - out[_as[j], 0] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = vals_seen + if keep_na and mask[_as[i]]: + out[_as[i], 0] = np.nan + else: + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + if ascending: + out[_as[j], 0] = j + 1 + else: + out[_as[j], 0] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = vals_seen if (i == N - 1 or ( (values[_as[i], 0] != values[_as[i+1], 0]) and not diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index c91e690e16027..8e2df60ed7e78 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -458,15 +458,35 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, int tiebreak Py_ssize_t i, j, N, K int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, vals_seen=1 + int64_t grp_na_count=0 ndarray[int64_t] _as - bint pct, ascending + ndarray[{{c_type}}] _values + ndarray[uint8_t] mask + bint pct, ascending, keep_na tiebreak = tiebreakers[kwargs['ties_method']] ascending = kwargs['ascending'] pct = kwargs['pct'] + keep_na = kwargs['na_option'] == 'keep' N, K = ( values).shape - _as = np.lexsort((values[:, 0], labels)) + _values = np.array(values[:, 0], copy=True) + + mask = np.isnan(_values).astype(np.uint8) + {{if name == 'int64' }} + order = (_values, labels) + {{else}} + if ascending ^ (kwargs['na_option'] == 'top'): + nan_value = np.inf + order 
= (_values, mask, labels) + else: + nan_value = -np.inf + order = (_values, ~mask, labels) + np.putmask(_values, mask, nan_value) + {{endif}} + + _as = np.lexsort(order) + if not ascending: _as = _as[::-1] @@ -476,38 +496,45 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, dups += 1 sum_ranks += i - grp_start + 1 - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j], 0] = j + 1 - else: - out[_as[j], 0] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = vals_seen + if keep_na and (values[_as[i], 0] != values[_as[i], 0]): + grp_na_count += 1 + out[_as[i], 0] = {{nan_val}} + else: + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + if ascending: + out[_as[j], 0] = j + 1 + else: + out[_as[j], 0] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = vals_seen if (i == N - 1 or ( - (values[_as[i], 0] != values[_as[i+1], 0]) and not - (isnan(values[_as[i], 0]) and - isnan(values[_as[i+1], 0]) + (_values[_as[i]] != _values[_as[i+1]]) and not + (isnan(_values[_as[i]]) and + isnan(_values[_as[i+1]]) ))): dups = sum_ranks = 0 val_start = i vals_seen += 1 + # Move to the next group, cleaning up any values if i == 
N - 1 or labels[_as[i]] != labels[_as[i+1]]: if pct: for j in range(grp_start, i + 1): - out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1) + out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1 + - grp_na_count) + grp_na_count = 0 grp_start = i + 1 vals_seen = 1 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index fe02b6f38e907..0ba4a46271fdf 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1952,10 +1952,11 @@ def test_rank_args(self, vals, ties_method, ascending, pct, exp): @pytest.mark.parametrize("vals", [ [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats - ['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects - [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, - pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-06'), np.nan, np.nan]]) + #['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects + #[pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, + # pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + # pd.Timestamp('2018-01-06'), np.nan, np.nan] + ]) @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ ('average', True, 'keep', False, DataFrame( [2., 2., np.nan, 5., 2., 4., np.nan, np.nan], columns=['val'])), From e17433d1c204af9523242913b8f469c9feaa9682 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 5 Feb 2018 09:10:22 -0800 Subject: [PATCH 12/35] Added missing obj support --- pandas/_libs/groupby.pyx | 36 ++++++++++++++++++---------- pandas/_libs/groupby_helper.pxi.in | 3 +-- pandas/tests/groupby/test_groupby.py | 2 +- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 69d63288148e4..7ee23b4576d53 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -140,8 +140,10 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, int tiebreak Py_ssize_t i, j, N, K int64_t 
val_start=0, grp_start=0, dups=0, sum_ranks=0, vals_seen=1 + int64_t grp_na_count=0 ndarray[int64_t] _as - bint pct, ascending + ndarray[object] _values + bint pct, ascending, keep_na tiebreak = tiebreakers[kwargs['ties_method']] ascending = kwargs['ascending'] @@ -149,20 +151,26 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, keep_na = kwargs['na_option'] == 'keep' N, K = ( values).shape - vals = np.array(values[:, 0], copy=True) - mask = missing.isnaobj(vals) + _values = np.array(values[:, 0], copy=True) + mask = missing.isnaobj(_values) + if ascending ^ (kwargs['na_option'] == 'top'): + nan_value = np.inf + order = (_values, mask, labels) + else: + nan_value = -np.inf + order = (_values, ~mask, labels) + np.putmask(_values, mask, nan_value) try: - _as = np.lexsort((vals, labels)) + _as = np.lexsort(order) except TypeError: # lexsort fails when missing data and objects are mixed # fallback to argsort - order = (vals, mask, labels) - _values = np.asarray(list(zip(order[0], order[1], order[2])), - dtype=[('values', 'O'), ('mask', '?'), - ('labels', 'i8')]) - _as = np.argsort(_values, kind='mergesort', order=('labels', - 'mask', 'values')) + _arr = np.asarray(list(zip(order[0], order[1], order[2])), + dtype=[('values', 'O'), ('mask', '?'), + ('labels', 'i8')]) + _as = np.argsort(_arr, kind='mergesort', order=('labels', + 'mask', 'values')) if not ascending: _as = _as[::-1] @@ -171,7 +179,8 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, dups += 1 sum_ranks += i - grp_start + 1 - if keep_na and mask[_as[i]]: + if keep_na and (values[_as[i], 0] != values[_as[i], 0]): + grp_na_count += 1 out[_as[i], 0] = np.nan else: if tiebreak == TIEBREAK_AVERAGE: @@ -204,8 +213,11 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: if pct: for j in range(grp_start, i + 1): - out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1) + out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1 + - grp_na_count) + 
grp_na_count = 0 grp_start = i + 1 + vals_seen = 1 cdef inline float64_t median_linear(float64_t* a, int n) nogil: diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 8e2df60ed7e78..bb68942895dfb 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -471,8 +471,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, N, K = ( values).shape _values = np.array(values[:, 0], copy=True) - mask = np.isnan(_values).astype(np.uint8) + {{if name == 'int64' }} order = (_values, labels) {{else}} @@ -487,7 +487,6 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, _as = np.lexsort(order) - if not ascending: _as = _as[::-1] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0ba4a46271fdf..0e8a2901448de 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1952,7 +1952,7 @@ def test_rank_args(self, vals, ties_method, ascending, pct, exp): @pytest.mark.parametrize("vals", [ [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats - #['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects + ['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects #[pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, # pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), # pd.Timestamp('2018-01-06'), np.nan, np.nan] From b0ea557502650616040dc8b05d5895a17bbaa0ed Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 5 Feb 2018 11:42:46 -0800 Subject: [PATCH 13/35] Added support for timestamps mixed with nan --- pandas/_libs/groupby_helper.pxi.in | 29 ++++++++++++++++++++++------ pandas/tests/groupby/test_groupby.py | 6 +++--- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index bb68942895dfb..da1a4b90cfa25 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -463,6 
+463,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, ndarray[{{c_type}}] _values ndarray[uint8_t] mask bint pct, ascending, keep_na + {{c_type}} nan_value tiebreak = tiebreakers[kwargs['ties_method']] ascending = kwargs['ascending'] @@ -471,19 +472,27 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, N, K = ( values).shape _values = np.array(values[:, 0], copy=True) + {{if name=='int64'}} + mask = (_values == {{nan_val}}).astype(np.uint8) + {{else}} mask = np.isnan(_values).astype(np.uint8) + {{endif}} - {{if name == 'int64' }} - order = (_values, labels) - {{else}} if ascending ^ (kwargs['na_option'] == 'top'): + {{if name == 'int64'}} + nan_value = np.iinfo(np.int64).max + {{else}} nan_value = np.inf + {{endif}} order = (_values, mask, labels) else: + {{if name == 'int64'}} + nan_value = np.iinfo(np.int64).min + {{else}} nan_value = -np.inf + {{endif}} order = (_values, ~mask, labels) np.putmask(_values, mask, nan_value) - {{endif}} _as = np.lexsort(order) @@ -495,9 +504,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, dups += 1 sum_ranks += i - grp_start + 1 - if keep_na and (values[_as[i], 0] != values[_as[i], 0]): + if keep_na and _values[_as[i]] == nan_value: grp_na_count += 1 - out[_as[i], 0] = {{nan_val}} + out[_as[i], 0] = nan else: if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): @@ -518,11 +527,19 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, for j in range(i - dups + 1, i + 1): out[_as[j], 0] = vals_seen + {{if name=='int64'}} + if (i == N - 1 or ( + (_values[_as[i]] != _values[_as[i+1]]) and not + (_values[_as[i]] == nan_value and + _values[_as[i+1]] == nan_value + ))): + {{else}} if (i == N - 1 or ( (_values[_as[i]] != _values[_as[i+1]]) and not (isnan(_values[_as[i]]) and isnan(_values[_as[i+1]]) ))): + {{endif}} dups = sum_ranks = 0 val_start = i vals_seen += 1 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0e8a2901448de..9fcef34715415 
100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1953,9 +1953,9 @@ def test_rank_args(self, vals, ties_method, ascending, pct, exp): @pytest.mark.parametrize("vals", [ [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats ['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects - #[pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, - # pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - # pd.Timestamp('2018-01-06'), np.nan, np.nan] + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06'), np.nan, np.nan] ]) @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ ('average', True, 'keep', False, DataFrame( From e15b4b2cdec233d01196cbec6e6d1081c50522d9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 5 Feb 2018 12:16:54 -0800 Subject: [PATCH 14/35] Added tests for multiple groups --- pandas/tests/groupby/test_groupby.py | 249 ++++++++++++--------------- 1 file changed, 108 insertions(+), 141 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9fcef34715415..d7f1aac59f91d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1895,61 +1895,49 @@ def test_rank_apply(self): expected = expected.reindex(result.index) assert_series_equal(result, expected) + @pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) @pytest.mark.parametrize("vals", [ [2, 2, 8, 2, 6], ['bar', 'bar', 'foo', 'bar', 'baz'], [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-06')]]) @pytest.mark.parametrize("ties_method,ascending,pct,exp", [ - ('average', True, False, DataFrame( - [2., 2., 5., 2., 4.], columns=['val'])), - ('average', True, True, DataFrame( - [0.4, 0.4, 1.0, 0.4, 0.8], columns=['val'])), - ('average', False, 
False, DataFrame( - [4., 4., 1., 4., 2.], columns=['val'])), - ('average', False, True, DataFrame( - [.8, .8, .2, .8, .4], columns=['val'])), - ('min', True, False, DataFrame( - [1., 1., 5., 1., 4.], columns=['val'])), - ('min', True, True, DataFrame( - [0.2, 0.2, 1.0, 0.2, 0.8], columns=['val'])), - ('min', False, False, DataFrame( - [3., 3., 1., 3., 2.], columns=['val'])), - ('min', False, True, DataFrame( - [.6, .6, .2, .6, .4], columns=['val'])), - ('max', True, False, DataFrame( - [3., 3., 5., 3., 4.], columns=['val'])), - ('max', True, True, DataFrame( - [0.6, 0.6, 1.0, 0.6, 0.8], columns=['val'])), - ('max', False, False, DataFrame( - [5., 5., 1., 5., 2.], columns=['val'])), - ('max', False, True, DataFrame( - [1., 1., .2, 1., .4], columns=['val'])), - ('first', True, False, DataFrame( - [1., 2., 5., 3., 4.], columns=['val'])), - ('first', True, True, DataFrame( - [0.2, 0.4, 1.0, 0.6, 0.8], columns=['val'])), - ('first', False, False, DataFrame( - [3., 4., 1., 5., 2.], columns=['val'])), - ('first', False, True, DataFrame( - [.6, .8, .2, 1., .4], columns=['val'])), - ('dense', True, False, DataFrame( - [1., 1., 3., 1., 2.], columns=['val'])), - ('dense', True, True, DataFrame( - [0.2, 0.2, 0.6, 0.2, 0.4], columns=['val'])), - ('dense', False, False, DataFrame( - [3., 3., 1., 3., 2.], columns=['val'])), - ('dense', False, True, DataFrame( - [.6, .6, .2, .6, .4], columns=['val'])), + ('average', True, False, [2., 2., 5., 2., 4.]), + ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ('average', False, False, [4., 4., 1., 4., 2.]), + ('average', False, True, [.8, .8, .2, .8, .4]), + ('min', True, False, [1., 1., 5., 1., 4.]), + ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ('min', False, False, [3., 3., 1., 3., 2.]), + ('min', False, True, [.6, .6, .2, .6, .4]), + ('max', True, False, [3., 3., 5., 3., 4.]), + ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ('max', False, False, [5., 5., 1., 5., 2.]), + ('max', False, True, [1., 1., .2, 1., .4]), + 
('first', True, False, [1., 2., 5., 3., 4.]), + ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ('first', False, False, [3., 4., 1., 5., 2.]), + ('first', False, True, [.6, .8, .2, 1., .4]), + ('dense', True, False, [1., 1., 3., 1., 2.]), + ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), + ('dense', False, False, [3., 3., 1., 3., 2.]), + ('dense', False, True, [.6, .6, .2, .6, .4]), ]) - def test_rank_args(self, vals, ties_method, ascending, pct, exp): + def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): if ties_method == 'first' and vals[0] == 'bar': pytest.xfail("See GH 19482") - df = DataFrame({'key': ['foo']*5, 'val': vals}) + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) result = df.groupby('key').rank(method=ties_method, ascending=ascending, pct=pct) - assert_frame_equal(result, exp) + exp_df = DataFrame(exp * len(grps), columns=['val']) + assert_frame_equal(result, exp_df) + + @pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) @pytest.mark.parametrize("vals", [ [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats ['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects @@ -1958,110 +1946,89 @@ def test_rank_args(self, vals, ties_method, ascending, pct, exp): pd.Timestamp('2018-01-06'), np.nan, np.nan] ]) @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ - ('average', True, 'keep', False, DataFrame( - [2., 2., np.nan, 5., 2., 4., np.nan, np.nan], columns=['val'])), - ('average', True, 'keep', True, DataFrame( - [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan], - columns=['val'])), - ('average', False, 'keep', False, DataFrame( - [4., 4., np.nan, 1., 4., 2., np.nan, np.nan], columns=['val'])), - ('average', False, 'keep', True, DataFrame( - [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan], columns=['val'])), - ('min', True, 'keep', False, DataFrame( - [1., 1., np.nan, 5., 1., 4., np.nan, np.nan], columns=['val'])), - ('min', True, 
'keep', True, DataFrame( - [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan], - columns=['val'])), - ('min', False, 'keep', False, DataFrame( - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan], columns=['val'])), - ('min', False, 'keep', True, DataFrame( - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan], columns=['val'])), - ('max', True, 'keep', False, DataFrame( - [3., 3., np.nan, 5., 3., 4., np.nan, np.nan], columns=['val'])), - ('max', True, 'keep', True, DataFrame( - [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], - columns=['val'])), - ('max', False, 'keep', False, DataFrame( - [5., 5., np.nan, 1., 5., 2., np.nan, np.nan], columns=['val'])), - ('max', False, 'keep', True, DataFrame( - [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan], columns=['val'])), - ('first', True, 'keep', False, DataFrame( - [1., 2., np.nan, 5., 3., 4., np.nan, np.nan], columns=['val'])), - ('first', True, 'keep', True, DataFrame( - [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], - columns=['val'])), - ('first', False, 'keep', False, DataFrame( - [3., 4., np.nan, 1., 5., 2., np.nan, np.nan], columns=['val'])), - ('first', False, 'keep', True, DataFrame( - [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan], columns=['val'])), - ('dense', True, 'keep', False, DataFrame( - [1., 1., np.nan, 3., 1., 2., np.nan, np.nan], columns=['val'])), - ('dense', True, 'keep', True, DataFrame( - [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan], - columns=['val'])), - ('dense', False, 'keep', False, DataFrame( - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan], columns=['val'])), - ('dense', False, 'keep', True, DataFrame( - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan], columns=['val'])), - ('average', True, 'no_na', False, DataFrame( - [2., 2., 7., 5., 2., 4., 7., 7.], columns=['val'])), - ('average', True, 'no_na', True, DataFrame( - [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875], - columns=['val'])), - ('average', False, 'no_na', False, DataFrame( - [4., 4., 7.0, 1., 4., 2., 7.0, 7.0], 
columns=['val'])), - ('average', False, 'no_na', True, DataFrame( - [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], - columns=['val'])), - ('min', True, 'no_na', False, DataFrame( - [1., 1., 6., 5., 1., 4., 6., 6.], columns=['val'])), - ('min', True, 'no_na', True, DataFrame( - [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75], - columns=['val'])), - ('min', False, 'no_na', False, DataFrame( - [3., 3., 6., 1., 3., 2., 6., 6.], columns=['val'])), - ('min', False, 'no_na', True, DataFrame( - [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75], - columns=['val'])), - ('max', True, 'no_na', False, DataFrame( - [3., 3., 8., 5., 3., 4., 8., 8.], columns=['val'])), - ('max', True, 'no_na', True, DataFrame( - [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.], columns=['val'])), - ('max', False, 'no_na', False, DataFrame( - [5., 5., 8., 1., 5., 2., 8., 8.], columns=['val'])), - ('max', False, 'no_na', True, DataFrame( - [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.], columns=['val'])), - ('first', True, 'no_na', False, DataFrame( - [1., 2., 6., 5., 3., 4., 7., 8.], columns=['val'])), - ('first', True, 'no_na', True, DataFrame( - [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.], - columns=['val'])), - ('first', False, 'no_na', False, DataFrame( - [3., 4., 6., 1., 5., 2., 7., 8.], columns=['val'])), - ('first', False, 'no_na', True, DataFrame( - [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.], - columns=['val'])), - ('dense', True, 'no_na', False, DataFrame( - [1., 1., 4., 3., 1., 2., 4., 4.], columns=['val'])), - ('dense', True, 'no_na', True, DataFrame( - [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5], - columns=['val'])), - ('dense', False, 'no_na', False, DataFrame( - [3., 3., 4., 1., 3., 2., 4., 4.], columns=['val'])), - ('dense', False, 'no_na', True, DataFrame( - [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5], - columns=['val'])), + ('average', True, 'keep', False, + [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]), + ('average', True, 'keep', True, + [0.4, 
0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]), + ('average', False, 'keep', False, + [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]), + ('average', False, 'keep', True, + [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]), + ('min', True, 'keep', False, + [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]), + ('min', True, 'keep', True, + [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ('min', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('min', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('max', True, 'keep', False, + [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]), + ('max', True, 'keep', True, + [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('max', False, 'keep', False, + [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]), + ('max', False, 'keep', True, + [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('first', True, 'keep', False, + [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]), + ('first', True, 'keep', True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('first', False, 'keep', False, + [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]), + ('first', False, 'keep', True, + [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('dense', True, 'keep', False, + [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), + ('dense', True, 'keep', True, + [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), + ('dense', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('dense', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]), + ('average', True, 'no_na', True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), + ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]), + ('average', False, 'no_na', True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]), + ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]), + ('min', True, 'no_na', True, + [0.125, 0.125, 0.75, 0.625, 
0.125, 0.5, 0.75, 0.75]), + ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]), + ('min', False, 'no_na', True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]), + ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]), + ('max', True, 'no_na', True, + [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]), + ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]), + ('max', False, 'no_na', True, + [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]), + ('first', True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]), + ('first', True, 'no_na', True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]), + ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]), + ('first', False, 'no_na', True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), + ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), + ('dense', True, 'no_na', True, + [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), + ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), + ('dense', False, 'no_na', True, + [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) ]) - def test_rank_args_missing(self, vals, ties_method, ascending, na_option, - pct, exp): + def test_rank_args_missing(self, grps, vals, ties_method, ascending, + na_option, pct, exp): if ties_method == 'first' and vals[0] == 'bar': pytest.xfail("See GH 19482") - - df = DataFrame({'key': ['foo']*8, 'val': vals}) + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) result = df.groupby('key').rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) - assert_frame_equal(result, exp) + exp_df = DataFrame(exp * len(grps), columns=['val']) + assert_frame_equal(result, exp_df) def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], From 04eb4f1126bc38cfa1efe04c78851028cb4c9106 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 5 Feb 2018 12:31:29 -0800 Subject: [PATCH 
15/35] Fixed bug with First tiebreak across multiple groups --- pandas/_libs/groupby.pyx | 6 +++--- pandas/_libs/groupby_helper.pxi.in | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7ee23b4576d53..cb4a05fbe2ff2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -18,7 +18,7 @@ from libc.stdlib cimport malloc, free from util cimport numeric, get_nat from algos cimport (swap, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX, - TIEBREAK_FIRST, TIEBREAK_FIRST_DESCENDING, TIEBREAK_DENSE) + TIEBREAK_FIRST, TIEBREAK_DENSE) from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers cdef int64_t iNaT = get_nat() @@ -195,9 +195,9 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): if ascending: - out[_as[j], 0] = j + 1 + out[_as[j], 0] = j + 1 - grp_start else: - out[_as[j], 0] = 2 * i - j - dups + 2 + out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = vals_seen diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index da1a4b90cfa25..01b6ea984a247 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -520,9 +520,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): if ascending: - out[_as[j], 0] = j + 1 + out[_as[j], 0] = j + 1 - grp_start else: - out[_as[j], 0] = 2 * i - j - dups + 2 + out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = vals_seen From 7a4602d549796861e85d8ff863acfe8b538aeb02 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 5 Feb 2018 13:06:35 -0800 Subject: [PATCH 16/35] Variable Name Cleanup --- pandas/_libs/groupby.pyx | 22 +++++++------- 
pandas/_libs/groupby_helper.pxi.in | 46 +++++++++++++++--------------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index cb4a05fbe2ff2..5b29c2072a64a 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -139,7 +139,7 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, cdef: int tiebreak Py_ssize_t i, j, N, K - int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, vals_seen=1 + int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, grp_vals_seen=1 int64_t grp_na_count=0 ndarray[int64_t] _as ndarray[object] _values @@ -151,16 +151,16 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, keep_na = kwargs['na_option'] == 'keep' N, K = ( values).shape - _values = np.array(values[:, 0], copy=True) - mask = missing.isnaobj(_values) + masked_vals = np.array(values[:, 0], copy=True) + mask = missing.isnaobj(masked_vals) if ascending ^ (kwargs['na_option'] == 'top'): - nan_value = np.inf - order = (_values, mask, labels) + nan_fill_val = np.inf + order = (masked_vals, mask, labels) else: - nan_value = -np.inf - order = (_values, ~mask, labels) - np.putmask(_values, mask, nan_value) + nan_fill_val = -np.inf + order = (masked_vals, ~mask, labels) + np.putmask(masked_vals, mask, nan_fill_val) try: _as = np.lexsort(order) except TypeError: @@ -200,7 +200,7 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = vals_seen + out[_as[j], 0] = grp_vals_seen if (i == N - 1 or ( (values[_as[i], 0] != values[_as[i+1], 0]) and not @@ -208,7 +208,7 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, )): dups = sum_ranks = 0 val_start = i - vals_seen += 1 + grp_vals_seen += 1 if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: if pct: @@ -217,7 +217,7 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, - grp_na_count) grp_na_count = 0 
grp_start = i + 1 - vals_seen = 1 + grp_vals_seen = 1 cdef inline float64_t median_linear(float64_t* a, int n) nogil: diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 01b6ea984a247..3d8752b8e08a2 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -457,13 +457,13 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, cdef: int tiebreak Py_ssize_t i, j, N, K - int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, vals_seen=1 + int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, grp_vals_seen=1 int64_t grp_na_count=0 ndarray[int64_t] _as - ndarray[{{c_type}}] _values + ndarray[{{c_type}}] masked_vals ndarray[uint8_t] mask bint pct, ascending, keep_na - {{c_type}} nan_value + {{c_type}} nan_fill_val tiebreak = tiebreakers[kwargs['ties_method']] ascending = kwargs['ascending'] @@ -471,28 +471,28 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, keep_na = kwargs['na_option'] == 'keep' N, K = ( values).shape - _values = np.array(values[:, 0], copy=True) + masked_vals = np.array(values[:, 0], copy=True) {{if name=='int64'}} - mask = (_values == {{nan_val}}).astype(np.uint8) + mask = (masked_vals == {{nan_val}}).astype(np.uint8) {{else}} - mask = np.isnan(_values).astype(np.uint8) + mask = np.isnan(masked_vals).astype(np.uint8) {{endif}} if ascending ^ (kwargs['na_option'] == 'top'): {{if name == 'int64'}} - nan_value = np.iinfo(np.int64).max + nan_fill_val = np.iinfo(np.int64).max {{else}} - nan_value = np.inf + nan_fill_val = np.inf {{endif}} - order = (_values, mask, labels) + order = (masked_vals, mask, labels) else: {{if name == 'int64'}} - nan_value = np.iinfo(np.int64).min + nan_fill_val = np.iinfo(np.int64).min {{else}} - nan_value = -np.inf + nan_fill_val = -np.inf {{endif}} - order = (_values, ~mask, labels) - np.putmask(_values, mask, nan_value) + order = (masked_vals, ~mask, labels) + np.putmask(masked_vals, mask, nan_fill_val) _as = np.lexsort(order) @@ -504,7 
+504,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, dups += 1 sum_ranks += i - grp_start + 1 - if keep_na and _values[_as[i]] == nan_value: + if keep_na and masked_vals[_as[i]] == nan_fill_val: grp_na_count += 1 out[_as[i], 0] = nan else: @@ -525,24 +525,24 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = vals_seen + out[_as[j], 0] = grp_vals_seen {{if name=='int64'}} if (i == N - 1 or ( - (_values[_as[i]] != _values[_as[i+1]]) and not - (_values[_as[i]] == nan_value and - _values[_as[i+1]] == nan_value + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not + (masked_vals[_as[i]] == nan_fill_val and + masked_vals[_as[i+1]] == nan_fill_val ))): {{else}} if (i == N - 1 or ( - (_values[_as[i]] != _values[_as[i+1]]) and not - (isnan(_values[_as[i]]) and - isnan(_values[_as[i+1]]) + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not + (isnan(masked_vals[_as[i]]) and + isnan(masked_vals[_as[i+1]]) ))): {{endif}} dups = sum_ranks = 0 val_start = i - vals_seen += 1 + grp_vals_seen += 1 # Move to the next group, cleaning up any values if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: @@ -552,7 +552,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, - grp_na_count) grp_na_count = 0 grp_start = i + 1 - vals_seen = 1 + grp_vals_seen = 1 {{endfor}} From 7be3bf32bd68ff6bbfcfd4a398ce3a46b92bf5d3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 5 Feb 2018 18:24:10 -0800 Subject: [PATCH 17/35] Converted kwargs to positional arguments in Cython layer --- pandas/_libs/groupby.pyx | 13 ++++++------- pandas/_libs/groupby_helper.pxi.in | 13 ++++++------- pandas/core/groupby.py | 13 +++++++++++-- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 5b29c2072a64a..21ef9d0fe351a 100644 --- a/pandas/_libs/groupby.pyx +++ 
b/pandas/_libs/groupby.pyx @@ -132,7 +132,8 @@ def group_last_object(ndarray[object, ndim=2] out, def group_rank_object(ndarray[float64_t, ndim=2] out, ndarray[object, ndim=2] values, ndarray[int64_t] labels, - bint is_datetimelike, **kwargs): + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): """ Only transforms on axis=0 """ @@ -143,18 +144,16 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, int64_t grp_na_count=0 ndarray[int64_t] _as ndarray[object] _values - bint pct, ascending, keep_na + bint keep_na - tiebreak = tiebreakers[kwargs['ties_method']] - ascending = kwargs['ascending'] - pct = kwargs['pct'] - keep_na = kwargs['na_option'] == 'keep' + tiebreak = tiebreakers[ties_method] + keep_na = na_option == 'keep' N, K = ( values).shape masked_vals = np.array(values[:, 0], copy=True) mask = missing.isnaobj(masked_vals) - if ascending ^ (kwargs['na_option'] == 'top'): + if ascending ^ (na_option == 'top'): nan_fill_val = np.inf order = (masked_vals, mask, labels) else: diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 3d8752b8e08a2..50df481928b47 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -450,7 +450,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, - bint is_datetimelike, **kwargs): + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): """ Only transforms on axis=0 """ @@ -462,13 +463,11 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, ndarray[int64_t] _as ndarray[{{c_type}}] masked_vals ndarray[uint8_t] mask - bint pct, ascending, keep_na + bint keep_na {{c_type}} nan_fill_val - tiebreak = tiebreakers[kwargs['ties_method']] - ascending = kwargs['ascending'] - pct = kwargs['pct'] - keep_na = kwargs['na_option'] == 'keep' + tiebreak = 
tiebreakers[ties_method] + keep_na = na_option == 'keep' N, K = ( values).shape masked_vals = np.array(values[:, 0], copy=True) @@ -478,7 +477,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, mask = np.isnan(masked_vals).astype(np.uint8) {{endif}} - if ascending ^ (kwargs['na_option'] == 'top'): + if ascending ^ (na_option == 'top'): {{if name == 'int64'}} nan_fill_val = np.iinfo(np.int64).max {{else}} diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 547eed4ea6bab..bf8cb9734c644 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2171,6 +2171,12 @@ def get_group_levels(self): # ------------------------------------------------------------ # Aggregation functions + def _group_rank_wrapper(func, *args, **kwargs): + return func(*args, kwargs.get('ties_method', 'average'), + kwargs.get('ascending', True), + kwargs.get('pct', False), + kwargs.get('na_option', 'keep')) + _cython_functions = { 'aggregate': { 'add': 'group_add', @@ -2195,7 +2201,10 @@ def get_group_levels(self): 'cumsum': 'group_cumsum', 'cummin': 'group_cummin', 'cummax': 'group_cummax', - 'rank': 'group_rank', + 'rank': { + 'name': 'group_rank', + 'f': _group_rank_wrapper + } } } @@ -2424,7 +2433,7 @@ def _transform(self, result, values, comp_ids, transform_func, chunk = chunk.squeeze() transform_func(result[:, :, i], values, - comp_ids, is_datetimelike) + comp_ids, is_datetimelike, **kwargs) else: transform_func(result, values, comp_ids, is_datetimelike, **kwargs) From ca28350b7f252f07111fc11fd130ea314153ffc6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 5 Feb 2018 18:27:08 -0800 Subject: [PATCH 18/35] Lint fixes --- pandas/core/groupby.py | 9 +++++---- pandas/tests/groupby/test_groupby.py | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index bf8cb9734c644..b0a76da89a5d2 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1002,7 +1002,8 @@ def 
_cython_transform(self, how, numeric_only=True, **kwargs): continue try: - result, names = self.grouper.transform(obj.values, how, **kwargs) + result, names = self.grouper.transform(obj.values, how, + **kwargs) except NotImplementedError: continue except AssertionError as e: @@ -1776,9 +1777,9 @@ def cumcount(self, ascending=True): def rank(self, method='average', ascending=True, na_option='keep', pct=False, axis=0): """Rank within each group""" - return self._cython_transform('rank', numeric_only=False, ties_method=method, - ascending=ascending, na_option=na_option, - pct=pct, axis=axis) + return self._cython_transform('rank', numeric_only=False, + ties_method=method, ascending=ascending, + na_option=na_option, pct=pct, axis=axis) @Substitution(name='groupby') @Appender(_doc_template) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d7f1aac59f91d..efca59a2cb754 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1930,8 +1930,8 @@ def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): key = np.repeat(grps, len(vals)) vals = vals * len(grps) df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, ascending=ascending, - pct=pct) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, pct=pct) exp_df = DataFrame(exp * len(grps), columns=['val']) assert_frame_equal(result, exp_df) @@ -2024,7 +2024,8 @@ def test_rank_args_missing(self, grps, vals, ties_method, ascending, key = np.repeat(grps, len(vals)) vals = vals * len(grps) df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, ascending=ascending, + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, na_option=na_option, pct=pct) exp_df = DataFrame(exp * len(grps), columns=['val']) From 913ce94168e6c756f177067ccaaf5970e275e5f8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Feb 2018 
09:25:05 -0800 Subject: [PATCH 19/35] Created enum for rank tiebreakers --- pandas/_libs/algos.pxd | 14 ++++----- pandas/_libs/algos.pyx | 18 +++-------- pandas/_libs/algos_rank_helper.pxi.in | 44 +++++++++++++-------------- pandas/_libs/groupby.pyx | 15 +++++---- pandas/_libs/groupby_helper.pxi.in | 12 ++++---- 5 files changed, 47 insertions(+), 56 deletions(-) diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 3834a68b67075..a535872ff7279 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -12,10 +12,10 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: b[0] = t return 0 -cdef: - int TIEBREAK_AVERAGE = 0 - int TIEBREAK_MIN = 1 - int TIEBREAK_MAX = 2 - int TIEBREAK_FIRST = 3 - int TIEBREAK_FIRST_DESCENDING = 4 - int TIEBREAK_DENSE = 5 +cdef enum TiebreakEnumType: + TIEBREAK_AVERAGE + TIEBREAK_MIN, + TIEBREAK_MAX + TIEBREAK_FIRST + TIEBREAK_FIRST_DESCENDING + TIEBREAK_DENSE diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5d17488963b1c..6dbfb2f5fac89 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -31,20 +31,12 @@ cdef double nan = NaN cdef int64_t iNaT = get_nat() -cdef: - int TIEBREAK_AVERAGE = 0 - int TIEBREAK_MIN = 1 - int TIEBREAK_MAX = 2 - int TIEBREAK_FIRST = 3 - int TIEBREAK_FIRST_DESCENDING = 4 - int TIEBREAK_DENSE = 5 - tiebreakers = { - 'average': TIEBREAK_AVERAGE, - 'min': TIEBREAK_MIN, - 'max': TIEBREAK_MAX, - 'first': TIEBREAK_FIRST, - 'dense': TIEBREAK_DENSE, + 'average': TiebreakEnumType.TIEBREAK_AVERAGE, + 'min': TiebreakEnumType.TIEBREAK_MIN, + 'max': TiebreakEnumType.TIEBREAK_MAX, + 'first': TiebreakEnumType.TIEBREAK_FIRST, + 'dense': TiebreakEnumType.TIEBREAK_DENSE, } diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 2f40bd4349a2e..e7eb6b1544d4c 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -121,11 +121,11 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', 
ascending=True, _values = np.asarray(list(zip(order[0], order[1])), dtype=_dt) _as = np.argsort(_values, kind='mergesort', order=('mask', 'values')) {{else}} - if tiebreak == TIEBREAK_FIRST: + if tiebreak == TiebreakEnumType.TIEBREAK_FIRST: # need to use a stable sort here _as = np.lexsort(keys=order) if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING + tiebreak = TiebreakEnumType.TIEBREAK_FIRST_DESCENDING else: _as = np.lexsort(keys=order) {{endif}} @@ -154,21 +154,21 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, if (i == n - 1 or are_diff(util.get_value_at(sorted_data, i + 1), val) or i == non_na_idx - 1): - if tiebreak == TIEBREAK_AVERAGE: + if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: + elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: + elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: + elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: raise ValueError('first not supported for non-numeric data') - elif tiebreak == TIEBREAK_FIRST_DESCENDING: + elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: + elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: total_tie_count += 1 for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = total_tie_count @@ -191,22 +191,22 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx - 1): - if tiebreak == TIEBREAK_AVERAGE: + if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == 
TIEBREAK_MIN: + elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: + elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: + elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: + elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: + elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: total_tie_count += 1 for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = total_tie_count @@ -300,11 +300,11 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', else: return ranks {{else}} - if tiebreak == TIEBREAK_FIRST: + if tiebreak == TiebreakEnumType.TIEBREAK_FIRST: # need to use a stable sort here _as = values.argsort(axis=1, kind='mergesort') if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING + tiebreak = TiebreakEnumType.TIEBREAK_FIRST_DESCENDING else: _as = values.argsort(1) {{endif}} @@ -359,16 +359,16 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', {{else}} if j == k - 1 or values[i, j + 1] != val: {{endif}} - if tiebreak == TIEBREAK_AVERAGE: + if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: + elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: + elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: + elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: {{if dtype == 'object'}} raise 
ValueError('first not supported ' 'for non-numeric data') @@ -376,10 +376,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = z + 1 {{endif}} - elif tiebreak == TIEBREAK_FIRST_DESCENDING: + elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: + elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: total_tie_count += 1 for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = total_tie_count diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 21ef9d0fe351a..253f0c279cf41 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -17,8 +17,7 @@ from libc.math cimport isnan from libc.stdlib cimport malloc, free from util cimport numeric, get_nat -from algos cimport (swap, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX, - TIEBREAK_FIRST, TIEBREAK_DENSE) +from algos cimport swap, TiebreakEnumType from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers cdef int64_t iNaT = get_nat() @@ -138,7 +137,7 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, Only transforms on axis=0 """ cdef: - int tiebreak + TiebreakEnumType tiebreak Py_ssize_t i, j, N, K int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, grp_vals_seen=1 int64_t grp_na_count=0 @@ -182,22 +181,22 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, grp_na_count += 1 out[_as[i], 0] = np.nan else: - if tiebreak == TIEBREAK_AVERAGE: + if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: + elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: + elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): 
out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: + elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): if ascending: out[_as[j], 0] = j + 1 - grp_start else: out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: + elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = grp_vals_seen diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 50df481928b47..ee90fc7339d74 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -456,7 +456,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, Only transforms on axis=0 """ cdef: - int tiebreak + TiebreakEnumType tiebreak Py_ssize_t i, j, N, K int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, grp_vals_seen=1 int64_t grp_na_count=0 @@ -507,22 +507,22 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, grp_na_count += 1 out[_as[i], 0] = nan else: - if tiebreak == TIEBREAK_AVERAGE: + if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: + elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: + elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: + elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): if ascending: out[_as[j], 0] = j + 1 - grp_start else: out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: + elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = grp_vals_seen From 47559416acaeb80bc1628d3ffd1688eada257f73 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Feb 2018 13:57:52 
-0800 Subject: [PATCH 20/35] Fixed build errors; Py <3.5 support --- pandas/_libs/algos.pyx | 10 +++--- pandas/_libs/algos_rank_helper.pxi.in | 44 +++++++++++++-------------- pandas/_libs/groupby.pyx | 13 ++++---- pandas/_libs/groupby_helper.pxi.in | 10 +++--- pandas/core/groupby.py | 5 ++- 5 files changed, 43 insertions(+), 39 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6dbfb2f5fac89..a418e54e4da9b 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -32,11 +32,11 @@ cdef double nan = NaN cdef int64_t iNaT = get_nat() tiebreakers = { - 'average': TiebreakEnumType.TIEBREAK_AVERAGE, - 'min': TiebreakEnumType.TIEBREAK_MIN, - 'max': TiebreakEnumType.TIEBREAK_MAX, - 'first': TiebreakEnumType.TIEBREAK_FIRST, - 'dense': TiebreakEnumType.TIEBREAK_DENSE, + 'average': TIEBREAK_AVERAGE, + 'min': TIEBREAK_MIN, + 'max': TIEBREAK_MAX, + 'first': TIEBREAK_FIRST, + 'dense': TIEBREAK_DENSE, } diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index e7eb6b1544d4c..2f40bd4349a2e 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -121,11 +121,11 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, _values = np.asarray(list(zip(order[0], order[1])), dtype=_dt) _as = np.argsort(_values, kind='mergesort', order=('mask', 'values')) {{else}} - if tiebreak == TiebreakEnumType.TIEBREAK_FIRST: + if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here _as = np.lexsort(keys=order) if not ascending: - tiebreak = TiebreakEnumType.TIEBREAK_FIRST_DESCENDING + tiebreak = TIEBREAK_FIRST_DESCENDING else: _as = np.lexsort(keys=order) {{endif}} @@ -154,21 +154,21 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, if (i == n - 1 or are_diff(util.get_value_at(sorted_data, i + 1), val) or i == non_na_idx - 1): - if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: + if tiebreak == TIEBREAK_AVERAGE: for j in 
range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: + elif tiebreak == TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: + elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i + 1 - elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: + elif tiebreak == TIEBREAK_FIRST: raise ValueError('first not supported for non-numeric data') - elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST_DESCENDING: + elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: + elif tiebreak == TIEBREAK_DENSE: total_tie_count += 1 for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = total_tie_count @@ -191,22 +191,22 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx - 1): - if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: + if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: + elif tiebreak == TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: + elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i + 1 - elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: + elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = j + 1 - elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST_DESCENDING: + elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: + elif tiebreak == TIEBREAK_DENSE: total_tie_count += 1 for j in range(i - dups + 
1, i + 1): ranks[argsorted[j]] = total_tie_count @@ -300,11 +300,11 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', else: return ranks {{else}} - if tiebreak == TiebreakEnumType.TIEBREAK_FIRST: + if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here _as = values.argsort(axis=1, kind='mergesort') if not ascending: - tiebreak = TiebreakEnumType.TIEBREAK_FIRST_DESCENDING + tiebreak = TIEBREAK_FIRST_DESCENDING else: _as = values.argsort(1) {{endif}} @@ -359,16 +359,16 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', {{else}} if j == k - 1 or values[i, j + 1] != val: {{endif}} - if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: + if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: + elif tiebreak == TIEBREAK_MIN: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: + elif tiebreak == TIEBREAK_MAX: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: + elif tiebreak == TIEBREAK_FIRST: {{if dtype == 'object'}} raise ValueError('first not supported ' 'for non-numeric data') @@ -376,10 +376,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = z + 1 {{endif}} - elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST_DESCENDING: + elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: + elif tiebreak == TIEBREAK_DENSE: total_tie_count += 1 for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = total_tie_count diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 253f0c279cf41..f4ea27b28b38a 100644 --- a/pandas/_libs/groupby.pyx +++ 
b/pandas/_libs/groupby.pyx @@ -17,7 +17,8 @@ from libc.math cimport isnan from libc.stdlib cimport malloc, free from util cimport numeric, get_nat -from algos cimport swap, TiebreakEnumType +from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN, + TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE) from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers cdef int64_t iNaT = get_nat() @@ -181,22 +182,22 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, grp_na_count += 1 out[_as[i], 0] = np.nan else: - if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: + if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: + elif tiebreak == TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: + elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: + elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): if ascending: out[_as[j], 0] = j + 1 - grp_start else: out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: + elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = grp_vals_seen diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index ee90fc7339d74..2f4625c2875cf 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -507,22 +507,22 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, grp_na_count += 1 out[_as[i], 0] = nan else: - if tiebreak == TiebreakEnumType.TIEBREAK_AVERAGE: + if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TiebreakEnumType.TIEBREAK_MIN: + elif tiebreak == TIEBREAK_MIN: for j in range(i - 
dups + 1, i + 1): out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TiebreakEnumType.TIEBREAK_MAX: + elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TiebreakEnumType.TIEBREAK_FIRST: + elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): if ascending: out[_as[j], 0] = j + 1 - grp_start else: out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TiebreakEnumType.TIEBREAK_DENSE: + elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = grp_vals_seen diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index b0a76da89a5d2..a86a1565c6eeb 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2173,7 +2173,10 @@ def get_group_levels(self): # Aggregation functions def _group_rank_wrapper(func, *args, **kwargs): - return func(*args, kwargs.get('ties_method', 'average'), + # Need to explicitly unpack *args to support Py < 3.5 + # See PEP 448 + return func(args[0], args[1], args[2], args[3], + kwargs.get('ties_method', 'average'), kwargs.get('ascending', True), kwargs.get('pct', False), kwargs.get('na_option', 'keep')) From d4a6662293740efa6e5f04f0356aea84e7e59a8c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Feb 2018 15:32:35 -0800 Subject: [PATCH 21/35] LINT fixes --- pandas/_libs/groupby.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f4ea27b28b38a..eca61189ac46d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -127,6 +127,7 @@ def group_last_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) def group_rank_object(ndarray[float64_t, ndim=2] out, @@ -201,10 +202,10 @@ def group_rank_object(ndarray[float64_t, ndim=2] out, for j in range(i - dups + 1, i + 1): out[_as[j], 0] = grp_vals_seen - if (i == N - 1 or ( + if i == N - 1 or
( (values[_as[i], 0] != values[_as[i+1], 0]) and not (values[_as[i], 0] is np.nan and values[_as[i+1], 0] is np.nan) - )): + ): dups = sum_ranks = 0 val_start = i grp_vals_seen += 1 From 56e7974c50c52b12f40e5d539c9a78328dffd560 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Feb 2018 18:56:04 -0800 Subject: [PATCH 22/35] Fixed isnan reference issue on Windows --- pandas/_libs/groupby.pyx | 4 +++- pandas/_libs/groupby_helper.pxi.in | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index eca61189ac46d..eccfa64812dad 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -13,7 +13,6 @@ from numpy cimport (ndarray, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, float32_t, float64_t) -from libc.math cimport isnan from libc.stdlib cimport malloc, free from util cimport numeric, get_nat @@ -26,6 +25,9 @@ cdef int64_t iNaT = get_nat() cdef double NaN = np.NaN cdef double nan = NaN +cdef extern from "numpy/npy_math.h" nogil: + bint npy_isnan(double x) + import missing diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 2f4625c2875cf..f478c0f3fcbeb 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -535,8 +535,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, {{else}} if (i == N - 1 or ( (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not - (isnan(masked_vals[_as[i]]) and - isnan(masked_vals[_as[i+1]]) + (npy_isnan(masked_vals[_as[i]]) and + npy_isnan(masked_vals[_as[i+1]]) ))): {{endif}} dups = sum_ranks = 0 From 9d7c3e6f160109d597a58882152677c595a952d9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 6 Feb 2018 19:04:28 -0800 Subject: [PATCH 23/35] Updated whatsnew --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 083242cd69b74..cf5a44442045b 100644 --- 
a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -581,6 +581,7 @@ Performance Improvements - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) +- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) .. _whatsnew_0230.docs: From 178654d39bbe525e063b669791773c464765bba7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Feb 2018 08:34:45 -0800 Subject: [PATCH 24/35] Added GroupBy object raises tests --- pandas/tests/groupby/test_groupby.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index efca59a2cb754..796b3a7192dbc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1898,7 +1898,7 @@ def test_rank_apply(self): @pytest.mark.parametrize("grps", [ ['qux'], ['qux', 'quux']]) @pytest.mark.parametrize("vals", [ - [2, 2, 8, 2, 6], ['bar', 'bar', 'foo', 'bar', 'baz'], + [2, 2, 8, 2, 6], [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-06')]]) @@ -1925,8 +1925,6 @@ def test_rank_apply(self): ('dense', False, True, [.6, .6, .2, .6, .4]), ]) def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): - if ties_method == 'first' and vals[0] == 'bar': - pytest.xfail("See GH 19482") key = np.repeat(grps, len(vals)) vals = vals * len(grps) df = DataFrame({'key': key, 'val': vals}) @@ -1940,7 +1938,6 @@ def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): ['qux'], ['qux', 'quux']]) @pytest.mark.parametrize("vals", [ 
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats - ['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-06'), np.nan, np.nan] @@ -2019,8 +2016,6 @@ def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): ]) def test_rank_args_missing(self, grps, vals, ties_method, ascending, na_option, pct, exp): - if ties_method == 'first' and vals[0] == 'bar': - pytest.xfail("See GH 19482") key = np.repeat(grps, len(vals)) vals = vals * len(grps) df = DataFrame({'key': key, 'val': vals}) @@ -2031,6 +2026,24 @@ def test_rank_args_missing(self, grps, vals, ties_method, ascending, exp_df = DataFrame(exp * len(grps), columns=['val']) assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("ties_method", [ + 'average', 'min', 'max', 'first', 'dense']) + @pytest.mark.parametrize("ascending", [True, False]) + @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) + @pytest.mark.parametrize("pct", [True, False]) + @pytest.mark.parametrize("vals", [ + ['bar', 'bar', 'foo', 'bar', 'baz'], + ['bar', np.nan, 'foo', np.nan, 'baz'] + ]) + def test_rank_object_raises(self, ties_method, ascending, na_option, + pct, vals): + df = DataFrame({'key': ['foo'] * 5, 'val': vals}) + with tm.assert_raises_regex(ValueError, + "rank not supported for object dtypes"): + df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option, pct=pct) + def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) From f6ae88a03b78917de66d3557e24db72a6fa35456 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Feb 2018 08:37:52 -0800 Subject: [PATCH 25/35] Raise ValueError in group_rank_object --- pandas/_libs/groupby.pyx | 86 +--------------------------------------- 1 file changed, 1 insertion(+), 85 deletions(-) diff --git 
a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index eccfa64812dad..c93b44f979635 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -130,96 +130,12 @@ def group_last_object(ndarray[object, ndim=2] out, out[i, j] = resx[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) def group_rank_object(ndarray[float64_t, ndim=2] out, ndarray[object, ndim=2] values, ndarray[int64_t] labels, bint is_datetimelike, object ties_method, bint ascending, bint pct, object na_option): - """ - Only transforms on axis=0 - """ - cdef: - TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K - int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, grp_vals_seen=1 - int64_t grp_na_count=0 - ndarray[int64_t] _as - ndarray[object] _values - bint keep_na - - tiebreak = tiebreakers[ties_method] - keep_na = na_option == 'keep' - N, K = ( values).shape - - masked_vals = np.array(values[:, 0], copy=True) - mask = missing.isnaobj(masked_vals) - - if ascending ^ (na_option == 'top'): - nan_fill_val = np.inf - order = (masked_vals, mask, labels) - else: - nan_fill_val = -np.inf - order = (masked_vals, ~mask, labels) - np.putmask(masked_vals, mask, nan_fill_val) - try: - _as = np.lexsort(order) - except TypeError: - # lexsort fails when missing data and objects are mixed - # fallback to argsort - _arr = np.asarray(list(zip(order[0], order[1], order[2])), - dtype=[('values', 'O'), ('mask', '?'), - ('labels', 'i8')]) - _as = np.argsort(_arr, kind='mergesort', order=('labels', - 'mask', 'values')) - - if not ascending: - _as = _as[::-1] - - for i in range(N): - dups += 1 - sum_ranks += i - grp_start + 1 - - if keep_na and (values[_as[i], 0] != values[_as[i], 0]): - grp_na_count += 1 - out[_as[i], 0] = np.nan - else: - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - 
for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j], 0] = j + 1 - grp_start - else: - out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = grp_vals_seen - - if i == N - 1 or ( - (values[_as[i], 0] != values[_as[i+1], 0]) and not - (values[_as[i], 0] is np.nan and values[_as[i+1], 0] is np.nan) - ): - dups = sum_ranks = 0 - val_start = i - grp_vals_seen += 1 - - if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if pct: - for j in range(grp_start, i + 1): - out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1 - - grp_na_count) - grp_na_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 + raise ValueError("rank not supported for object dtypes") cdef inline float64_t median_linear(float64_t* a, int n) nogil: From caacef2f047060eaf2eb2e81ea7efff71d36d691 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Feb 2018 09:44:17 -0800 Subject: [PATCH 26/35] Used anonymous func for rank wrapper --- pandas/core/groupby.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a86a1565c6eeb..6fa98bc74c294 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2172,15 +2172,6 @@ def get_group_levels(self): # ------------------------------------------------------------ # Aggregation functions - def _group_rank_wrapper(func, *args, **kwargs): - # Need to explicity unpack *args to support Py < 3.5 - # See PEP 448 - return func(args[0], args[1], args[2], args[3], - kwargs.get('ties_method', 'average'), - kwargs.get('ascending', True), - kwargs.get('pct', False), - kwargs.get('na_option', 'keep')) - _cython_functions = { 'aggregate': { 'add': 'group_add', @@ -2207,7 +2198,12 @@ def _group_rank_wrapper(func, *args, **kwargs): 'cummax': 'group_cummax', 'rank': { 'name': 
'group_rank', - 'f': _group_rank_wrapper + 'f': lambda func, a, b, c, d, **kwargs: func(a, b, c, d, + kwargs.get('ties_method', 'average'), + kwargs.get('ascending', True), + kwargs.get('pct', False), + kwargs.get('na_option', 'keep') + ) } } } From a315a92b687442c177787de361bb3b262a75f4d2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Feb 2018 10:14:07 -0800 Subject: [PATCH 27/35] Removed group_rank_object --- pandas/_libs/groupby.pyx | 8 -------- pandas/tests/groupby/test_groupby.py | 3 +-- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c93b44f979635..bc2b45e2294e9 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -130,14 +130,6 @@ def group_last_object(ndarray[object, ndim=2] out, out[i, j] = resx[i, j] -def group_rank_object(ndarray[float64_t, ndim=2] out, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels, - bint is_datetimelike, object ties_method, - bint ascending, bint pct, object na_option): - raise ValueError("rank not supported for object dtypes") - - cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 796b3a7192dbc..51b57ce916279 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2038,8 +2038,7 @@ def test_rank_args_missing(self, grps, vals, ties_method, ascending, def test_rank_object_raises(self, ties_method, ascending, na_option, pct, vals): df = DataFrame({'key': ['foo'] * 5, 'val': vals}) - with tm.assert_raises_regex(ValueError, - "rank not supported for object dtypes"): + with tm.assert_raises_regex(TypeError, "not callable"): df.groupby('key').rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) From a6ca485561e17e1d3c20883b86f4eac1be9bf06e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Feb 2018 11:00:23 -0800 
Subject: [PATCH 28/35] Added comments to groupby_helper --- pandas/_libs/groupby_helper.pxi.in | 41 +++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index f478c0f3fcbeb..7c854f4ebf010 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -470,6 +470,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, keep_na = na_option == 'keep' N, K = ( values).shape + # Copy values into new array in order to fill missing data + # with mask, without obfuscating location of missing data + # in values array masked_vals = np.array(values[:, 0], copy=True) {{if name=='int64'}} mask = (masked_vals == {{nan_val}}).astype(np.uint8) @@ -493,20 +496,47 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, order = (masked_vals, ~mask, labels) np.putmask(masked_vals, mask, nan_fill_val) + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values _as = np.lexsort(order) if not ascending: _as = _as[::-1] with nogil: + # Loop over the length of the value array + # each incremental i value can be looked up in the _as array + # that we sorted previously, which gives us the location of + # that sorted value for retrieval back from the original + # values / masked_vals arrays for i in range(N): + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change + # Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 + # if keep_na, check for missing values and assign back + # to the result where appropriate if keep_na and masked_vals[_as[i]] == nan_fill_val: grp_na_count += 1 out[_as[i], 0] = nan else: + # this implementation is inefficient because it will + # continue overwriting 
previously encountered dups + # i.e. if 5 duplicated values are encountered it will + # write to the result as follows (assumes avg tiebreaker): + # 1 + # 1.5 1.5 + # 2 2 2 + # 2.5 2.5 2.5 2.5 + # 3 3 3 3 3 + # + # could potentially be optimized to only write to the + # result once the last duplicate value is encountered if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = sum_ranks / dups @@ -526,6 +556,11 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, for j in range(i - dups + 1, i + 1): out[_as[j], 0] = grp_vals_seen + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is coming + # up. the conditional also needs to handle nan equality and the + # end of iteration {{if name=='int64'}} if (i == N - 1 or ( (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not @@ -543,7 +578,11 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, val_start = i grp_vals_seen += 1 - # Move to the next group, cleaning up any values + # Similar to the previous conditional, check now if we are moving to a + # new group.
If so, keep track of the index where the new group occurs, + # so the tiebreaker calculations can decrement that from their position + # if the pct flag is True, go back and overwrite the result for + # the group to be divided by the size of the group (excluding na values) if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: if pct: for j in range(grp_start, i + 1): From fd29d70417b58f725c5d4b4092ebbc02de5270a7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Feb 2018 11:00:42 -0800 Subject: [PATCH 29/35] Added tests for rank bugs --- pandas/tests/groupby/test_groupby.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 51b57ce916279..2db772ac54369 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2026,6 +2026,24 @@ def test_rank_args_missing(self, grps, vals, ties_method, ascending, exp_df = DataFrame(exp * len(grps), columns=['val']) assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("pct,exp", [ + (False, [3., 3., 3., 3., 3.]), + (True, [.6, .6, .6, .6, .6])]) + def test_rank_resets_each_group(self, pct, exp): + df = DataFrame( + {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], + 'val': [1] * 10} + ) + result = df.groupby('key').rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=['val']) + assert_frame_equal(result, exp_df) + + def test_rank_avg_even_vals(self): + df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4}) + result = df.groupby('key').rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val']) + assert_frame_equal(result, exp_df) + @pytest.mark.parametrize("ties_method", [ 'average', 'min', 'max', 'first', 'dense']) @pytest.mark.parametrize("ascending", [True, False]) From b9e471924fe6e7e7691c85373184d679f917a08b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Feb 2018 11:02:31 -0800 Subject: [PATCH 30/35] Fixed issue with ranks not resetting across groups --- 
pandas/_libs/groupby_helper.pxi.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 7c854f4ebf010..1ce4737150e41 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -588,7 +588,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, for j in range(grp_start, i + 1): out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1 - grp_na_count) + dups = sum_ranks = 0 grp_na_count = 0 + val_start = i + 1 grp_start = i + 1 grp_vals_seen = 1 From 613384c592f5a2e068e5f611031671e87de7c1f2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Feb 2018 11:09:51 -0800 Subject: [PATCH 31/35] Changed types; fixed tiebreaker float casting issue --- pandas/_libs/groupby_helper.pxi.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 1ce4737150e41..c1b2a6c6ccaa6 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -457,9 +457,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, """ cdef: TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K - int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, grp_vals_seen=1 - int64_t grp_na_count=0 + Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0 ndarray[int64_t] _as ndarray[{{c_type}}] masked_vals ndarray[uint8_t] mask @@ -539,7 +538,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # result once the last duplicate value is encountered if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups + out[_as[j], 0] = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = i - grp_start - dups + 2 @@ -583,6 +582,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # so the tiebreaker calculations can decrement that 
from their position # if the pct flag is True, go back and overwrite the result for # the group to be divided by the size of the group (excluding na values) + # also be sure to reset any of the items helping to calculate dups if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: if pct: for j in range(grp_start, i + 1): From 94a2749c373ec22bfa92c254612f98d44fef1adb Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Feb 2018 11:49:28 -0800 Subject: [PATCH 32/35] Documentation cleanup --- pandas/_libs/groupby_helper.pxi.in | 25 +++++++++++++++++++-- pandas/core/groupby.py | 35 +++++++++++++++++++++++++----- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index c1b2a6c6ccaa6..f49499af7b894 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -452,8 +452,29 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, ndarray[int64_t] labels, bint is_datetimelike, object ties_method, bint ascending, bint pct, object na_option): - """ - Only transforms on axis=0 + """Provides the rank of values within each group + + Parameters + ---------- + out : array of float64_t values which this method will write its results to + values : array of {{c_type}} values to be ranked + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + is_datetimelike : bool + unused in this method but provided for call compatibility with other + Cython transformations + na_option : {'keep', 'top', 'bottom'} + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + ascending : boolean + False for ranks by high (1) to low (N) + pct : boolean + Compute percentage rank of data within each group + + Notes + ----- + This method modifies the `out` parameter rather than returning an object """ cdef: TiebreakEnumType tiebreak diff --git
a/pandas/core/groupby.py b/pandas/core/groupby.py index 6fa98bc74c294..0363bcd02aa16 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1776,7 +1776,29 @@ def cumcount(self, ascending=True): @Appender(_doc_template) def rank(self, method='average', ascending=True, na_option='keep', pct=False, axis=0): - """Rank within each group""" + """Provides the rank of values within each group + + Parameters + ---------- + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + na_option : {'keep', 'top', 'bottom'}, default 'keep' + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + ascending : boolean, default True + False for ranks by high (1) to low (N) + pct : boolean, default False + Compute percentage rank of data within each group + + Returns + ------- + DataFrame with ranking of values within each group + """ return self._cython_transform('rank', numeric_only=False, ties_method=method, ascending=ascending, na_option=na_option, pct=pct, axis=axis) @@ -2198,11 +2220,12 @@ def get_group_levels(self): 'cummax': 'group_cummax', 'rank': { 'name': 'group_rank', - 'f': lambda func, a, b, c, d, **kwargs: func(a, b, c, d, - kwargs.get('ties_method', 'average'), - kwargs.get('ascending', True), - kwargs.get('pct', False), - kwargs.get('na_option', 'keep') + 'f': lambda func, a, b, c, d, **kwargs: func( + a, b, c, d, + kwargs.get('ties_method', 'average'), + kwargs.get('ascending', True), + kwargs.get('pct', False), + kwargs.get('na_option', 'keep') ) } } From 3ee99c09323419fe3d8744a5622322c0f8b6ca5d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Feb 2018 14:47:22 -0800 Subject: [PATCH 33/35] Removed unused import from groupby.pyx --- pandas/_libs/groupby.pyx
| 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index bc2b45e2294e9..4cd50e80fe748 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -28,8 +28,6 @@ cdef double nan = NaN cdef extern from "numpy/npy_math.h" nogil: bint npy_isnan(double x) -import missing - # TODO: aggregate multiple columns in single pass # ---------------------------------------------------------------------- From b430635d110122d8a08e450f79eb9d3d1ff08bfc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 9 Feb 2018 10:09:29 -0800 Subject: [PATCH 34/35] Removed npy_isnan import --- pandas/_libs/groupby.pyx | 3 --- pandas/_libs/groupby_helper.pxi.in | 12 +----------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 4cd50e80fe748..d75c3a71896e3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -25,9 +25,6 @@ cdef int64_t iNaT = get_nat() cdef double NaN = np.NaN cdef double nan = NaN -cdef extern from "numpy/npy_math.h" nogil: - bint npy_isnan(double x) - # TODO: aggregate multiple columns in single pass # ---------------------------------------------------------------------- diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index f49499af7b894..739e8d29f2ca6 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -581,19 +581,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # reset the dups and sum_ranks, knowing that a new value is coming # up. 
the conditional also needs to handle nan equality and the # end of iteration - {{if name=='int64'}} if (i == N - 1 or ( (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not - (masked_vals[_as[i]] == nan_fill_val and - masked_vals[_as[i+1]] == nan_fill_val - ))): - {{else}} - if (i == N - 1 or ( - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not - (npy_isnan(masked_vals[_as[i]]) and - npy_isnan(masked_vals[_as[i+1]]) - ))): - {{endif}} + (mask[_as[i]] and mask[_as[i+1]]))): dups = sum_ranks = 0 val_start = i grp_vals_seen += 1 From aa4578d78e5d4feaf6a8a3caac51acf7d2ebc112 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 9 Feb 2018 10:33:25 -0800 Subject: [PATCH 35/35] Added grp_sizes array, broke out pct calc --- pandas/_libs/groupby_helper.pxi.in | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 739e8d29f2ca6..b24444c422efa 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -481,6 +481,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0 Py_ssize_t grp_vals_seen=1, grp_na_count=0 ndarray[int64_t] _as + ndarray[float64_t, ndim=2] grp_sizes ndarray[{{c_type}}] masked_vals ndarray[uint8_t] mask bint keep_na @@ -489,6 +490,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' N, K = ( values).shape + grp_sizes = np.ones_like(out) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data @@ -588,25 +590,27 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, val_start = i grp_vals_seen += 1 - # Similar to the previous conditional, check now if we are moving to a - # new group. 
If so, keep track of the index where the new group occurs, - # so the tiebreaker calculations can decrement that from their position - # if the pct flag is True, go back and overwrite the result for - # the group to be divided by the size of the group (excluding na values) - # also be sure to reset any of the items helping to calculate dups + # Similar to the previous conditional, check now if we are moving + # to a new group. If so, keep track of the index where the new + # group occurs, so the tiebreaker calculations can decrement that + # from their position. fill in the size of each group encountered + # (used by pct calculations later). also be sure to reset any of + # the items helping to calculate dups if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if pct: - for j in range(grp_start, i + 1): - out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1 - - grp_na_count) + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count dups = sum_ranks = 0 grp_na_count = 0 val_start = i + 1 grp_start = i + 1 grp_vals_seen = 1 + if pct: + for i in range(N): + out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endfor}} + #---------------------------------------------------------------------- # group_min, group_max #----------------------------------------------------------------------