From 3b7831a8e5d3781041dc007638a3988e1123f1bb Mon Sep 17 00:00:00 2001 From: Nicolas Bonnotte Date: Wed, 1 Jun 2016 21:12:43 +0200 Subject: [PATCH 1/3] BUG in DataFrameGroupBy.rank returning empty frame #11759 fixes #11759 --- pandas/core/groupby.py | 24 +++++++++++++++++++++++- pandas/tests/test_groupby.py | 20 ++++++++++++++++++-- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 6179857978b7b..356a353120828 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -76,7 +76,7 @@ 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount', 'resample', 'describe', - 'rank', 'quantile', + 'quantile', 'fillna', 'mad', 'any', 'all', @@ -1378,6 +1378,27 @@ def cumsum(self, axis=0, *args, **kwargs): return self._cython_transform('cumsum') + @Substitution(name='groupby') + @Appender(_doc_template) + def rank(self, axis=0, method='average', numeric_only=True, + na_option='keep', ascending=True, pct=False): + """Compute numerical data ranks (1 through n) along axis. + """ + + if numeric_only: + data = self._obj_with_exclusions._get_numeric_data() + if data.size == 0: + raise DataError('No numeric types to aggregate') + data = data.groupby(self.grouper) + else: + data = self + + def wrapper(values): + return values.rank(axis=axis, method=method, na_option=na_option, + ascending=ascending, pct=pct) + + return data.transform(wrapper) + @Substitution(name='groupby') @Appender(_doc_template) def shift(self, periods=1, freq=None, axis=0): @@ -3182,6 +3203,7 @@ def aggregate(self, arg, *args, **kwargs): agg = aggregate def _aggregate_generic(self, func, *args, **kwargs): + if self.grouper.nkeys != 1: raise AssertionError('Number of keys must be 1') diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 3f5b4152afe31..a4c2bf75aff8d 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3646,6 +3646,24 @@ def test_column_select_via_attr(self): expected = self.df.groupby('A').agg(np.mean) assert_frame_equal(result, expected) + def test_rank(self): + # GH 11759 + df = DataFrame({'a': ['A1', 'A1', 'A1'], + 'b': ['B1', 'B1', 'B2'], + 'c': 1.}) + df = df.set_index('a') + dg = df.groupby('c') + self.assertRaises(DataError, dg.rank, method='first') + + # with another numeric column + df = DataFrame({'a': ['A1', 'A1', 'A1'], + 'b': ['B1', 'B1', 'B2'], + 'c': 1., + 'd': 1.}) + df = df.set_index('a') + expected = df.drop('b', axis=1).groupby('c').rank(method='first') + assert_frame_equal(df.groupby('c').rank(method='first'), expected) + def test_rank_apply(self): lev1 = tm.rands_array(10, 100) lev2 = tm.rands_array(10, 130) @@ -5753,7 +5771,6 @@ def test_groupby_whitelist(self): 'cumcount', 'resample', 'describe', - 'rank', 'quantile', 'fillna', 'mad', @@ -5794,7 +5811,6 @@ def test_groupby_whitelist(self): 'cumcount', 'resample', 'describe', - 'rank', 'quantile', 'fillna', 'mad', From 6f1e356a27690f6981cdf1eb4b78d0efa83591b4 Mon Sep 17 00:00:00 2001 From: Nicolas Bonnotte Date: Wed, 8 Jun 2016 22:30:11 +0200 Subject: [PATCH 2/3] More tests --- pandas/core/groupby.py | 9 ++++++++- pandas/tests/test_groupby.py | 21 +++++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 356a353120828..179929fa01e7b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1397,7 +1397,14 @@ def wrapper(values): return values.rank(axis=axis, method=method, na_option=na_option, ascending=ascending, pct=pct) - return data.transform(wrapper) + try: + return data.transform(wrapper) + except ValueError: + if not numeric_only and method=='first': + raise ValueError('first not supported for non-numeric data') + # such a ValueError is raised by pandas.algos.rank_2d_generic + # for regular (non-grouped) dataframes + @Substitution(name='groupby') @Appender(_doc_template) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index a4c2bf75aff8d..79b29a46de6df 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3653,7 +3653,14 @@ def test_rank(self): 'c': 1.}) df = df.set_index('a') dg = df.groupby('c') - self.assertRaises(DataError, dg.rank, method='first') + self.assertRaises(DataError, dg.rank, + method='first') + self.assertRaises(DataError, dg.rank, + method='first', numeric_only=True) + self.assertRaises(ValueError, dg.rank, + method='first', numeric_only=False) + # such a ValueError is raised by pandas.algos.rank_2d_generic + # for regular (non-grouped) dataframes # with another numeric column df = DataFrame({'a': ['A1', 'A1', 'A1'], @@ -3661,8 +3668,18 @@ def test_rank(self): 'c': 1., 'd': 1.}) df = df.set_index('a') + dg = df.groupby('c') expected = df.drop('b', axis=1).groupby('c').rank(method='first') - assert_frame_equal(df.groupby('c').rank(method='first'), expected) + + result = dg.rank(method='first') + assert_frame_equal(result, expected) + + result = dg.rank(method='first', numeric_only=True) + assert_frame_equal(result, expected) + + self.assertRaises(ValueError, dg.rank, + method='first', numeric_only=False) + # same remark as above def test_rank_apply(self): lev1 = tm.rands_array(10, 100) From f432b2b51e3fe881c51bfce817b026bf74d2e270 Mon Sep 17 00:00:00 2001 From: Nicolas Bonnotte Date: Sat, 18 Jun 2016 16:13:21 +0200 Subject: [PATCH 3/3] More tests: categorical and datetime --- pandas/core/groupby.py | 20 ++++++------- pandas/tests/test_groupby.py | 57 ++++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 179929fa01e7b..f529af6543939 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1385,26 +1385,24 @@ def rank(self, axis=0, method='average', numeric_only=True, """Compute numerical data ranks (1 through n) along axis. """ - if numeric_only: - data = self._obj_with_exclusions._get_numeric_data() - if data.size == 0: - raise DataError('No numeric types to aggregate') - data = data.groupby(self.grouper) - else: - data = self - def wrapper(values): return values.rank(axis=axis, method=method, na_option=na_option, ascending=ascending, pct=pct) try: - return data.transform(wrapper) + return self.transform(wrapper) except ValueError: - if not numeric_only and method=='first': + if not numeric_only and method == 'first': raise ValueError('first not supported for non-numeric data') # such a ValueError is raised by pandas.algos.rank_2d_generic # for regular (non-grouped) dataframes - + if numeric_only: + data = self._obj_with_exclusions._get_numeric_data() + if data.size == 0: + raise DataError('No numeric types to aggregate') + data = data.groupby(self.grouper) + return data.transform(wrapper) + raise @Substitution(name='groupby') @Appender(_doc_template) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 79b29a46de6df..ba0b343aa49ce 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3647,9 +3647,28 @@ def test_column_select_via_attr(self): assert_frame_equal(result, expected) def test_rank(self): - # GH 11759 + # normal behavior df = DataFrame({'a': ['A1', 'A1', 'A1'], - 'b': ['B1', 'B1', 'B2'], + 'b': [2, 1, 1], + 'c': 1.}) + df = df.set_index('a') + dg = df.groupby('c') + expected = DataFrame({'a': ['A1', 'A1', 'A1'], + 'b': [3., 1., 2.]}) + expected = expected.set_index('a') + + result = dg.rank(method='first') + assert_frame_equal(result, expected) + + result = dg.rank(method='first', numeric_only=True) + assert_frame_equal(result, expected) + + result = dg.rank(method='first', numeric_only=False) + assert_frame_equal(result, expected) + + # GH 11759: non numeric data + df = DataFrame({'a': ['A1', 'A1', 'A1'], + 'b': ['B2', 'B1', 'B1'], 'c': 1.}) df = df.set_index('a') dg = df.groupby('c') @@ -3662,9 +3681,41 @@ def test_rank(self): # such a ValueError is raised by pandas.algos.rank_2d_generic # for regular (non-grouped) dataframes + # with categorical data + df = DataFrame({'a': ['A1', 'A1', 'A1'], + 'b': Categorical(['big', 'small', 'small'], + categories=['small', 'big'], + ordered=True), + 'c': 1.}) + df = df.set_index('a') + dg = df.groupby('c') + self.assertRaises(DataError, dg.rank, + method='first') + self.assertRaises(DataError, dg.rank, + method='first', numeric_only=True) + self.assertRaises(ValueError, dg.rank, + method='first', numeric_only=False) + + # with datetime data + df = DataFrame({'a': ['A1', 'A1', 'A1'], + 'b': [datetime(2002, 2, 2), datetime(2001, 1, 1), + datetime(2001, 1, 1)], + 'c': 1.}) + df = df.set_index('a') + dg = df.groupby('c') + + result = dg.rank(method='first') + assert_frame_equal(result, expected) + + result = dg.rank(method='first', numeric_only=True) + assert_frame_equal(result, expected) + + result = dg.rank(method='first', numeric_only=False) + assert_frame_equal(result, expected) + # with another numeric column df = DataFrame({'a': ['A1', 'A1', 'A1'], - 'b': ['B1', 'B1', 'B2'], + 'b': ['B2', 'B1', 'B1'], 'c': 1., 'd': 1.}) df = df.set_index('a')