From 85b267a231d9798b41090c1b4f20d4572bb82573 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Thu, 16 Feb 2017 18:38:31 +0530 Subject: [PATCH 01/16] Added support for categorical datatype in rank - issue#15420 --- pandas/core/algorithms.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4ae46fe33a5cc..def978bc7e472 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -989,6 +989,10 @@ def _get_data_algo(values, func_map): f = func_map['uint64'] values = _ensure_uint64(values) + elif is_categorical_dtype(values): + f = func_map['int64'] + values = _ensure_int64(values.codes) + else: values = _ensure_object(values) From ce90207c0f4de5365f0e184feeda640376adc102 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Thu, 16 Feb 2017 19:24:12 +0530 Subject: [PATCH 02/16] BUG: GH#15420 rank for categoricals --- pandas/tests/test_categorical.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index cc99cf0f830aa..ccb1aa9c750d1 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4549,6 +4549,13 @@ def test_concat_categorical(self): 'h': [None] * 6 + cat_values}) tm.assert_frame_equal(res, exp) + def test_rank_categorical(self): + exp = pd.Series([1., 2., 3., 4., 5., 6.], name='A') + dframe = pd.DataFrame(['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], columns=['A']) + dframe['A'] = dframe['A'].astype('category', ).cat.set_categories( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True) + res = dframe['A'].rank() + tm.assert_series_equal(res, exp) class TestCategoricalSubclassing(tm.TestCase): From bf4e36c674061e8881a3b8a5066c0ec7f4c50c1c Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Thu, 16 Feb 2017 23:27:23 +0530 Subject: [PATCH 03/16] GH#15420 added support for na_option when ranking categorical --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/algorithms.py | 7 ++-- pandas/tests/series/test_analytics.py | 53 +++++++++++++++++++++++++++ pandas/tests/test_categorical.py | 8 ---- 4 files changed, 58 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index fa24c973a7549..6152567f915c5 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -627,3 +627,4 @@ Bug Fixes - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) +- Bug in ``.rank()`` rank incorrectly orders ordered categories diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index def978bc7e472..2a8e3c1d31e13 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -989,9 +989,10 @@ def _get_data_algo(values, func_map): f = func_map['uint64'] values = _ensure_uint64(values) - elif is_categorical_dtype(values): - f = func_map['int64'] - values = _ensure_int64(values.codes) + elif is_categorical_dtype(values) and values.ordered: + nanMapper = np.vectorize(lambda t: np.NaN if t == -1 else t*1.) + f = func_map['float64'] + values = _ensure_float64(nanMapper(values.codes)) else: values = _ensure_object(values) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 222165e9d3633..40891a439c788 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1057,6 +1057,59 @@ def test_rank(self): iranks = iseries.rank() assert_series_equal(iranks, exp) + # GH issue #15420 rank incorrectly orders ordered categories + + # Test ascending/descending ranking for ordered categoricals + exp = pd.Series([1., 2., 3., 4., 5., 6.]) + exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) + ser = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] + ) + ordered = ser.astype('category', ).cat.set_categories( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ordered=True + ) + assert_series_equal(ordered.rank(), exp) + assert_series_equal(ordered.rank(ascending=False), exp_desc) + + # Unordered categoricals should be ranked as objects + unordered = ser.astype('category', ).cat.set_categories( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ordered=False + ) + res = unordered.rank() + assert_series_equal(res, unordered.astype(object).rank()) + + # Test na_option for rank data + na_ser = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] + ).astype('category', ).cat.set_categories( + [ + 'first', 'second', 'third', 'fourth', + 'fifth', 'sixth', 'seventh' + ], + ordered=True + ) + + exp_top = pd.Series([2., 3., 4., 5., 6., 7., 1.]) + exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.]) + exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN]) + + assert_series_equal( + na_ser.rank(na_option='top'), + exp_top + ) + + assert_series_equal( + na_ser.rank(na_option='bottom'), + exp_bot + ) + + assert_series_equal( + na_ser.rank(na_option='keep'), + exp_keep + ) + def test_rank_signature(self): s = Series([0, 1]) s.rank(method='average') diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index ccb1aa9c750d1..9225f8cd281b3 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4549,14 +4549,6 @@ def test_concat_categorical(self): 'h': [None] * 6 + cat_values}) tm.assert_frame_equal(res, exp) - def test_rank_categorical(self): - exp = pd.Series([1., 2., 3., 4., 5., 6.], name='A') - dframe = pd.DataFrame(['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], columns=['A']) - dframe['A'] = dframe['A'].astype('category', ).cat.set_categories( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True) - res = dframe['A'].rank() - tm.assert_series_equal(res, exp) - class TestCategoricalSubclassing(tm.TestCase): def test_constructor(self): From 6b709214e65a03ad02d1b86e3f793787700e0105 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Fri, 17 Feb 2017 15:38:11 +0530 Subject: [PATCH 04/16] GH#15420 move rank inside categoricals --- pandas/core/algorithms.py | 6 --- pandas/core/categorical.py | 49 ++++++++++++++++++++++ pandas/tests/series/test_analytics.py | 58 ++++++++++++++++++--------- 3 files changed, 87 insertions(+), 26 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2a8e3c1d31e13..a4b0d6cb0723c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -988,12 +988,6 @@ def _get_data_algo(values, func_map): elif is_unsigned_integer_dtype(values): f = func_map['uint64'] values = _ensure_uint64(values) - - elif is_categorical_dtype(values) and values.ordered: - nanMapper = np.vectorize(lambda t: np.NaN if t == -1 else t*1.) - f = func_map['float64'] - values = _ensure_float64(nanMapper(values.codes)) - else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b6898f11ffa74..e1cda2fda626c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -14,6 +14,7 @@ _coerce_indexer_dtype) from pandas.types.dtypes import CategoricalDtype from pandas.types.common import (_ensure_int64, + _ensure_float64, _ensure_object, _ensure_platform_int, is_dtype_equal, @@ -1404,6 +1405,54 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) + def rank(self, method='average', na_option='keep', + ascending=True, pct=False): + """ + Rank the values along a given axis. + + Parameters + ---------- + values : array-like + Array whose values will be ranked. The number of dimensions in this + array must not exceed 2. + method : {'average', 'min', 'max', 'first', 'dense'}, + default 'average' + The method by which tiebreaks are broken during the ranking. + na_option : {'keep', 'top'}, default 'keep' + The method by which NaNs are placed in the ranking. + - ``keep``: rank each NaN value with a NaN ranking + - ``top``: replace each NaN with either +/- inf so that they + there are ranked at the top + - ``bottom``: replace each NaN with either +/- inf so that they + there are ranked at the bottom + ascending : boolean, default True + Whether or not the elements should be ranked in ascending order. + pct : boolean, default False + Whether or not to the display the returned rankings in integer form + (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). + """ + from pandas.core.series import Series + if na_option not in ['keep', 'top', 'bottom']: + raise ValueError('invalid na_position: {!r}'.format(na_option)) + + codes = self._codes.copy() + codes = codes.astype(float) + if self._ordered: + na_mask = (codes == -1) + codes[na_mask] = np.nan + codes = _ensure_float64(codes) + ranks = _algos.rank_1d_float64( + codes, ties_method=method, + na_option=na_option, ascending=ascending, pct=pct + ) + else: + values = _ensure_object(self) + ranks = _algos.rank_1d_object( + values, ties_method=method, + na_option=na_option, ascending=ascending, pct=pct + ) + return Series(ranks) + def order(self, inplace=False, ascending=True, na_position='last'): """ DEPRECATED: use :meth:`Categorical.sort_values`. That function diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 40891a439c788..731a473429416 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1057,15 +1057,14 @@ def test_rank(self): iranks = iseries.rank() assert_series_equal(iranks, exp) + def test_rank_categorical(self): # GH issue #15420 rank incorrectly orders ordered categories - + # Test ascending/descending ranking for ordered categoricals exp = pd.Series([1., 2., 3., 4., 5., 6.]) exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) - ser = pd.Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] - ) - ordered = ser.astype('category', ).cat.set_categories( + ordered = pd.Categorical( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True ) @@ -1073,21 +1072,19 @@ def test_rank(self): assert_series_equal(ordered.rank(ascending=False), exp_desc) # Unordered categoricals should be ranked as objects - unordered = ser.astype('category', ).cat.set_categories( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + unord_ser = pd.Series(['first', 'second', 'third', 'fourth']) + unordered = pd.Categorical( + ['first', 'second', 'third', 'fourth'], + ['first', 'second', 'third', 'fourth'], ordered=False ) res = unordered.rank() - assert_series_equal(res, unordered.astype(object).rank()) + assert_series_equal(res, unord_ser.astype(object).rank()) # Test na_option for rank data - na_ser = pd.Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] - ).astype('category', ).cat.set_categories( - [ - 'first', 'second', 'third', 'fourth', - 'fifth', 'sixth', 'seventh' - ], + na_ser = pd.Categorical( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN], + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True ) @@ -1095,21 +1092,42 @@ def test_rank(self): exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.]) exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN]) + assert_series_equal(na_ser.rank(na_option='top'), exp_top) + assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot) + assert_series_equal(na_ser.rank(na_option='keep'), exp_keep) + + # Test na_option for rank data with ascending False + exp_top = pd.Series([7., 6., 5., 4., 3., 2., 1.]) + exp_bot = pd.Series([6., 5., 4., 3., 2., 1., 7.]) + exp_keep = pd.Series([6., 5., 4., 3., 2., 1., np.NaN]) + assert_series_equal( - na_ser.rank(na_option='top'), + na_ser.rank(na_option='top', ascending=False), exp_top ) - assert_series_equal( - na_ser.rank(na_option='bottom'), + na_ser.rank(na_option='bottom', ascending=False), exp_bot ) - assert_series_equal( - na_ser.rank(na_option='keep'), + na_ser.rank(na_option='keep', ascending=False), exp_keep ) + # Test with pct=True + na_ser = pd.Categorical( + ['first', 'second', 'third', 'fourth', np.NaN], + ['first', 'second', 'third', 'fourth'], + ordered=True + ) + exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2]) + exp_bot = pd.Series([0.2, 0.4, 0.6, 0.8, 1.]) + exp_keep = pd.Series([0.25, 0.5, 0.75, 1., np.NaN]) + + assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top) + assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot) + assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep) + def test_rank_signature(self): s = Series([0, 1]) s.rank(method='average') From 4220e565d83427e4e4352c93d0c77f99760a7a7f Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sat, 18 Feb 2017 13:58:38 +0530 Subject: [PATCH 05/16] BUG: GH15420 - _rank private method on Categorical --- pandas/core/algorithms.py | 56 ++++++++++++++++----------- pandas/core/categorical.py | 54 +++++++------------------- pandas/tests/series/test_analytics.py | 19 +++++---- 3 files changed, 58 insertions(+), 71 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a4b0d6cb0723c..4a706dd556bb3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -598,29 +598,39 @@ def mode(values): def rank(values, axis=0, method='average', na_option='keep', ascending=True, pct=False): """ - Rank the values along a given axis. - - Parameters - ---------- - values : array-like - Array whose values will be ranked. The number of dimensions in this - array must not exceed 2. - axis : int, default 0 - Axis over which to perform rankings. - method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - The method by which tiebreaks are broken during the ranking. - na_option : {'keep', 'top'}, default 'keep' - The method by which NaNs are placed in the ranking. - - ``keep``: rank each NaN value with a NaN ranking - - ``top``: replace each NaN with either +/- inf so that they - there are ranked at the top - ascending : boolean, default True - Whether or not the elements should be ranked in ascending order. - pct : boolean, default False - Whether or not to the display the returned rankings in integer form - (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). - """ - if values.ndim == 1: + Compute numerical data ranks (1 through n) along axis. Equal values are + assigned a rank that is the average of the ranks of those values + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + index to direct ranking + method : {'average', 'min', 'max', 'first', 'dense'} + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + numeric_only : boolean, default None + Include only float, int, boolean data. Valid only for DataFrame or + Panel objects + na_option : {'keep', 'top', 'bottom'} + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + ascending : boolean, default True + False for ranks by high (1) to low (N) + pct : boolean, default False + Computes percentage rank of data + + Returns + ------- + ranks : same type as caller + """ + if is_categorical(values): + ranks = values._rank(axis=axis, method=method, ascending=ascending, + na_option=na_option, pct=pct) + elif values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) ranks = f(values, ties_method=method, ascending=ascending, na_option=na_option, pct=pct) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index e1cda2fda626c..6deb12bd71452 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -14,7 +14,6 @@ _coerce_indexer_dtype) from pandas.types.dtypes import CategoricalDtype from pandas.types.common import (_ensure_int64, - _ensure_float64, _ensure_object, _ensure_platform_int, is_dtype_equal, @@ -1405,53 +1404,28 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) - def rank(self, method='average', na_option='keep', - ascending=True, pct=False): + def _rank(self, *args, **kwargs): """ - Rank the values along a given axis. + For correctly ranking ordered categorical data. See GH#15420 + + Ordered categorical data should be ranked on the basis of + codes. + + Returns + ------- + numpy array - Parameters - ---------- - values : array-like - Array whose values will be ranked. The number of dimensions in this - array must not exceed 2. - method : {'average', 'min', 'max', 'first', 'dense'}, - default 'average' - The method by which tiebreaks are broken during the ranking. - na_option : {'keep', 'top'}, default 'keep' - The method by which NaNs are placed in the ranking. - - ``keep``: rank each NaN value with a NaN ranking - - ``top``: replace each NaN with either +/- inf so that they - there are ranked at the top - - ``bottom``: replace each NaN with either +/- inf so that they - there are ranked at the bottom - ascending : boolean, default True - Whether or not the elements should be ranked in ascending order. - pct : boolean, default False - Whether or not to the display the returned rankings in integer form - (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ - from pandas.core.series import Series - if na_option not in ['keep', 'top', 'bottom']: - raise ValueError('invalid na_position: {!r}'.format(na_option)) + from pandas.core.algorithms import rank - codes = self._codes.copy() - codes = codes.astype(float) if self._ordered: + codes = self._codes.astype('float64') na_mask = (codes == -1) codes[na_mask] = np.nan - codes = _ensure_float64(codes) - ranks = _algos.rank_1d_float64( - codes, ties_method=method, - na_option=na_option, ascending=ascending, pct=pct - ) + ranks = rank(codes, *args, **kwargs) else: - values = _ensure_object(self) - ranks = _algos.rank_1d_object( - values, ties_method=method, - na_option=na_option, ascending=ascending, pct=pct - ) - return Series(ranks) + ranks = rank(self.astype('object'), *args, **kwargs) + return ranks def order(self, inplace=False, ascending=True, na_position='last'): """ diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 731a473429416..5a9b8bfd9754c 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1063,8 +1063,9 @@ def test_rank_categorical(self): # Test ascending/descending ranking for ordered categoricals exp = pd.Series([1., 2., 3., 4., 5., 6.]) exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) - ordered = pd.Categorical( + ordered = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ).astype('category').cat.set_categories( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True ) @@ -1072,18 +1073,19 @@ def test_rank_categorical(self): assert_series_equal(ordered.rank(ascending=False), exp_desc) # Unordered categoricals should be ranked as objects - unord_ser = pd.Series(['first', 'second', 'third', 'fourth']) - unordered = pd.Categorical( - ['first', 'second', 'third', 'fourth'], - ['first', 'second', 'third', 'fourth'], + unordered = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ).astype('category').cat.set_categories( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=False ) res = unordered.rank() - assert_series_equal(res, unord_ser.astype(object).rank()) + assert_series_equal(res, unordered.astype(object).rank()) # Test na_option for rank data - na_ser = pd.Categorical( + na_ser = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN], + ).astype('category').cat.set_categories( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True ) @@ -1115,8 +1117,9 @@ def test_rank_categorical(self): ) # Test with pct=True - na_ser = pd.Categorical( + na_ser = pd.Series( ['first', 'second', 'third', 'fourth', np.NaN], + ).astype('category').cat.set_categories( ['first', 'second', 'third', 'fourth'], ordered=True ) From 9a6b5cdc14cf3ba5f4a207a548f54de192ea56e9 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sat, 18 Feb 2017 14:02:19 +0530 Subject: [PATCH 06/16] BUG: GH15420 - _rank private method on Categorical --- pandas/core/algorithms.py | 41 ++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4a706dd556bb3..8707d0a106d7b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -598,35 +598,28 @@ def mode(values): def rank(values, axis=0, method='average', na_option='keep', ascending=True, pct=False): """ - Compute numerical data ranks (1 through n) along axis. Equal values are - assigned a rank that is the average of the ranks of those values + Rank the values along a given axis. Parameters ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - index to direct ranking - method : {'average', 'min', 'max', 'first', 'dense'} - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups - numeric_only : boolean, default None - Include only float, int, boolean data. Valid only for DataFrame or - Panel objects - na_option : {'keep', 'top', 'bottom'} - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending + values : array-like + Array whose values will be ranked. The number of dimensions in this + array must not exceed 2. + axis : int, default 0 + Axis over which to perform rankings. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + The method by which tiebreaks are broken during the ranking. + na_option : {'keep', 'top'}, default 'keep' + The method by which NaNs are placed in the ranking. + - ``keep``: rank each NaN value with a NaN ranking + - ``top``: replace each NaN with either +/- inf so that they + there are ranked at the top ascending : boolean, default True - False for ranks by high (1) to low (N) + Whether or not the elements should be ranked in ascending order. pct : boolean, default False - Computes percentage rank of data - - Returns - ------- - ranks : same type as caller - """ + Whether or not to the display the returned rankings in integer form + (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). + """ if is_categorical(values): ranks = values._rank(axis=axis, method=method, ascending=ascending, na_option=na_option, pct=pct) From fa0b4c24c4b8af66ccbfcdf81894ead6b7389c12 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sat, 18 Feb 2017 14:04:37 +0530 Subject: [PATCH 07/16] BUG: GH15420 - _rank private method on Categorical --- pandas/core/algorithms.py | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8707d0a106d7b..253f522b8ee45 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -598,27 +598,27 @@ def mode(values): def rank(values, axis=0, method='average', na_option='keep', ascending=True, pct=False): """ - Rank the values along a given axis. - - Parameters - ---------- - values : array-like - Array whose values will be ranked. The number of dimensions in this - array must not exceed 2. - axis : int, default 0 - Axis over which to perform rankings. - method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - The method by which tiebreaks are broken during the ranking. - na_option : {'keep', 'top'}, default 'keep' - The method by which NaNs are placed in the ranking. - - ``keep``: rank each NaN value with a NaN ranking - - ``top``: replace each NaN with either +/- inf so that they - there are ranked at the top - ascending : boolean, default True - Whether or not the elements should be ranked in ascending order. - pct : boolean, default False - Whether or not to the display the returned rankings in integer form - (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). + Rank the values along a given axis. + + Parameters + ---------- + values : array-like + Array whose values will be ranked. The number of dimensions in this + array must not exceed 2. + axis : int, default 0 + Axis over which to perform rankings. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + The method by which tiebreaks are broken during the ranking. + na_option : {'keep', 'top'}, default 'keep' + The method by which NaNs are placed in the ranking. + - ``keep``: rank each NaN value with a NaN ranking + - ``top``: replace each NaN with either +/- inf so that they + there are ranked at the top + ascending : boolean, default True + Whether or not the elements should be ranked in ascending order. + pct : boolean, default False + Whether or not to the display the returned rankings in integer form + (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ if is_categorical(values): ranks = values._rank(axis=axis, method=method, ascending=ascending, From fbaba1b8bb47df00775ace95a693b70982023c25 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sat, 18 Feb 2017 21:36:02 +0530 Subject: [PATCH 08/16] return values for rank from categorical object --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/algorithms.py | 8 ++++---- pandas/core/categorical.py | 17 ++++++----------- pandas/tests/series/test_analytics.py | 3 ++- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6152567f915c5..0b501adba5039 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -578,6 +578,7 @@ Bug Fixes +- Bug in ``.rank()`` which incorrectly ranks ordered categories (:issue:`15420`) @@ -627,4 +628,3 @@ Bug Fixes - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) -- Bug in ``.rank()`` rank incorrectly orders ordered categories diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 253f522b8ee45..fa7052068ad9c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -620,10 +620,7 @@ def rank(values, axis=0, method='average', na_option='keep', Whether or not to the display the returned rankings in integer form (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ - if is_categorical(values): - ranks = values._rank(axis=axis, method=method, ascending=ascending, - na_option=na_option, pct=pct) - elif values.ndim == 1: + if values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) ranks = f(values, ties_method=method, ascending=ascending, na_option=na_option, pct=pct) @@ -991,6 +988,9 @@ def _get_data_algo(values, func_map): elif is_unsigned_integer_dtype(values): f = func_map['uint64'] values = _ensure_uint64(values) + elif is_categorical(values) and values._ordered: + f = func_map['float64'] + values = values._values_for_rank() else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 6deb12bd71452..f713401c8cba3 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1404,28 +1404,23 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) - def _rank(self, *args, **kwargs): + def _values_for_rank(self): """ For correctly ranking ordered categorical data. See GH#15420 Ordered categorical data should be ranked on the basis of - codes. + codes with -1 translated to NaN. Returns ------- numpy array """ - from pandas.core.algorithms import rank - + values = self._codes.astype('float64') if self._ordered: - codes = self._codes.astype('float64') - na_mask = (codes == -1) - codes[na_mask] = np.nan - ranks = rank(codes, *args, **kwargs) - else: - ranks = rank(self.astype('object'), *args, **kwargs) - return ranks + na_mask = (values == -1) + values[na_mask] = np.nan + return values def order(self, inplace=False, ascending=True, na_position='last'): """ diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 5a9b8bfd9754c..8b6dd47675cc4 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1079,8 +1079,9 @@ def test_rank_categorical(self): ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=False ) + exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.]) res = unordered.rank() - assert_series_equal(res, unordered.astype(object).rank()) + assert_series_equal(res, exp_unordered) # Test na_option for rank data na_ser = pd.Series( From ef999c3e87322ce042351b83f33d9168c580699a Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Fri, 24 Feb 2017 17:18:48 +0530 Subject: [PATCH 09/16] merged with upstream master --- pandas/core/algorithms.py | 7 +++++-- pandas/core/categorical.py | 12 ++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index fa7052068ad9c..c8b113b52bc09 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -988,9 +988,12 @@ def _get_data_algo(values, func_map): elif is_unsigned_integer_dtype(values): f = func_map['uint64'] values = _ensure_uint64(values) - elif is_categorical(values) and values._ordered: - f = func_map['float64'] + elif is_categorical_dtype(values): values = values._values_for_rank() + if is_float_dtype(values): + f = func_map['float64'] + else: + f = func_map['object'] else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index f713401c8cba3..b88a6b171b316 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1416,10 +1416,14 @@ def _values_for_rank(self): numpy array """ - values = self._codes.astype('float64') - if self._ordered: - na_mask = (values == -1) - values[na_mask] = np.nan + if self.ordered: + values = self.codes + mask = values == -1 + if mask.any(): + values = values.astype('float64') + values[mask] = np.nan + else: + values = np.array(self) return values def order(self, inplace=False, ascending=True, na_position='last'): From 5e5bbebcced0bb862be2fbcbbfdda23d8fbefe15 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Thu, 16 Feb 2017 19:24:12 +0530 Subject: [PATCH 10/16] BUG: GH#15420 rank for categoricals --- pandas/tests/test_categorical.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 9225f8cd281b3..ccb1aa9c750d1 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4549,6 +4549,14 @@ def test_concat_categorical(self): 'h': [None] * 6 + cat_values}) tm.assert_frame_equal(res, exp) + def test_rank_categorical(self): + exp = pd.Series([1., 2., 3., 4., 5., 6.], name='A') + dframe = pd.DataFrame(['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], columns=['A']) + dframe['A'] = dframe['A'].astype('category', ).cat.set_categories( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True) + res = dframe['A'].rank() + tm.assert_series_equal(res, exp) + class TestCategoricalSubclassing(tm.TestCase): def test_constructor(self): From 049c0fc3653c90081a6ea7da398b277d017db415 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Thu, 16 Feb 2017 23:27:23 +0530 Subject: [PATCH 11/16] GH#15420 added support for na_option when ranking categorical --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/series/test_analytics.py | 18 ++++++++++++++---- pandas/tests/test_categorical.py | 8 -------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0b501adba5039..666790f1d64a8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -628,3 +628,4 @@ Bug Fixes - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) +- Bug in ``.rank()`` rank incorrectly orders ordered categories diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 8b6dd47675cc4..480c7a63355fb 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1059,13 +1059,14 @@ def test_rank(self): def test_rank_categorical(self): # GH issue #15420 rank incorrectly orders ordered categories - + # Test ascending/descending ranking for ordered categoricals exp = pd.Series([1., 2., 3., 4., 5., 6.]) exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) - ordered = pd.Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], - ).astype('category').cat.set_categories( + ser = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] + ) + ordered = ser.astype('category', ).cat.set_categories( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True ) @@ -1088,6 +1089,15 @@ def test_rank_categorical(self): ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN], ).astype('category').cat.set_categories( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + + # Test na_option for rank data + na_ser = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] + ).astype('category', ).cat.set_categories( + [ + 'first', 'second', 'third', 'fourth', + 'fifth', 'sixth', 'seventh' + ], ordered=True ) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index ccb1aa9c750d1..9225f8cd281b3 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4549,14 +4549,6 @@ def test_concat_categorical(self): 'h': [None] * 6 + cat_values}) tm.assert_frame_equal(res, exp) - def test_rank_categorical(self): - exp = pd.Series([1., 2., 3., 4., 5., 6.], name='A') - dframe = pd.DataFrame(['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], columns=['A']) - dframe['A'] = dframe['A'].astype('category', ).cat.set_categories( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True) - res = dframe['A'].rank() - tm.assert_series_equal(res, exp) - class TestCategoricalSubclassing(tm.TestCase): def test_constructor(self): From 40d88c1e676eec5fe96e7c0556546b9138723010 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sat, 18 Feb 2017 21:36:02 +0530 Subject: [PATCH 12/16] return values for rank from categorical object --- doc/source/whatsnew/v0.20.0.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 666790f1d64a8..0b501adba5039 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -628,4 +628,3 @@ Bug Fixes - Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) -- Bug in ``.rank()`` rank incorrectly orders ordered categories From f8ec01968629558ebecbe7c347990c5514c2c122 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Fri, 24 Feb 2017 11:59:56 +0530 Subject: [PATCH 13/16] ask Categorical for ranking function --- pandas/core/categorical.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b88a6b171b316..a2f3b0b1b8a4e 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1404,6 +1404,25 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) + def _funcs_for_rank(self): + """ + For correctly ranking ordered categorical data. See GH#15420 + + Ordered categorical data should be ranked on the basis of + codes with -1 translated to NaN, as floats, and unordered + as objects + + Returns + ------- + numpy array + + """ + if self._ordered: + f = _algos.rank_1d_float64 + else: + f = _algos.rank_1d_object + return f + def _values_for_rank(self): """ For correctly ranking ordered categorical data. See GH#15420 From c43a029c25de44bb048f4522e0bb50f7e207e9b1 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Fri, 24 Feb 2017 13:24:48 +0530 Subject: [PATCH 14/16] using if/else construct to pick sorting function for categoricals --- pandas/core/categorical.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index a2f3b0b1b8a4e..b88a6b171b316 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1404,25 +1404,6 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) - def _funcs_for_rank(self): - """ - For correctly ranking ordered categorical data. See GH#15420 - - Ordered categorical data should be ranked on the basis of - codes with -1 translated to NaN, as floats, and unordered - as objects - - Returns - ------- - numpy array - - """ - if self._ordered: - f = _algos.rank_1d_float64 - else: - f = _algos.rank_1d_object - return f - def _values_for_rank(self): """ For correctly ranking ordered categorical data. See GH#15420 From 3ba4e3ac49d187a4fd3b4404c223adceacc7b6c3 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Fri, 24 Feb 2017 17:48:20 +0530 Subject: [PATCH 15/16] corrections after rebasing --- pandas/tests/series/test_analytics.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 480c7a63355fb..f18ae438f5262 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1063,10 +1063,9 @@ def test_rank_categorical(self): # Test ascending/descending ranking for ordered categoricals exp = pd.Series([1., 2., 3., 4., 5., 6.]) exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) - ser = pd.Series( + ordered = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] - ) - ordered = ser.astype('category', ).cat.set_categories( + ).astype('category', ).cat.set_categories( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True ) @@ -1084,12 +1083,6 @@ def test_rank_categorical(self): res = unordered.rank() assert_series_equal(res, exp_unordered) - # Test na_option for rank data - na_ser = pd.Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN], - ).astype('category').cat.set_categories( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], - # Test na_option for rank data na_ser = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] From a7e573bb2dbbd9f06aebb5e5aa83dc553d7cdae6 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Fri, 24 Feb 2017 18:49:30 +0530 Subject: [PATCH 16/16] moved test for categorical, in rank, to top --- pandas/core/algorithms.py | 10 ++++------ pandas/tests/series/test_analytics.py | 2 +- pandas/tests/test_categorical.py | 1 + 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c8b113b52bc09..b11927a80fb2e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -973,6 +973,10 @@ def _hashtable_algo(f, values, return_dtype=None): def _get_data_algo(values, func_map): f = None + + if is_categorical_dtype(values): + values = values._values_for_rank() + if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) @@ -988,12 +992,6 @@ def _get_data_algo(values, func_map): elif is_unsigned_integer_dtype(values): f = func_map['uint64'] values = _ensure_uint64(values) - elif is_categorical_dtype(values): - values = values._values_for_rank() - if is_float_dtype(values): - f = func_map['float64'] - else: - f = func_map['object'] else: values = _ensure_object(values) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index f18ae438f5262..b092e4f084767 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1059,7 +1059,7 @@ def test_rank(self): def test_rank_categorical(self): # GH issue #15420 rank incorrectly orders ordered categories - + # Test ascending/descending ranking for ordered categoricals exp = pd.Series([1., 2., 3., 4., 5., 6.]) exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 9225f8cd281b3..cc99cf0f830aa 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4549,6 +4549,7 @@ def test_concat_categorical(self): 'h': [None] * 6 + cat_values}) tm.assert_frame_equal(res, exp) + class TestCategoricalSubclassing(tm.TestCase): def test_constructor(self):